From db5e523fddd2a1a47d9ea63498734d0141925513 Mon Sep 17 00:00:00 2001 From: "Shawn O. Pearce" Date: Sat, 5 Aug 2006 02:04:21 -0400 Subject: [PATCH 01/81] Created fast-import, a tool to quickly generating a pack from blobs. Signed-off-by: Shawn O. Pearce --- .gitignore | 1 + Makefile | 1 + fast-import.c | 214 ++++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 216 insertions(+) create mode 100644 fast-import.c diff --git a/.gitignore b/.gitignore index 55cd9844d6..8ddccd7dac 100644 --- a/.gitignore +++ b/.gitignore @@ -32,6 +32,7 @@ git-diff-index git-diff-stages git-diff-tree git-describe +git-fast-import git-fetch git-fetch-pack git-findtags diff --git a/Makefile b/Makefile index b15b420ea2..a37f74a1ff 100644 --- a/Makefile +++ b/Makefile @@ -186,6 +186,7 @@ SIMPLE_PROGRAMS = \ PROGRAMS = \ git-convert-objects$X git-fetch-pack$X git-fsck-objects$X \ git-hash-object$X git-index-pack$X git-local-fetch$X \ + git-fast-import$X \ git-merge-base$X \ git-merge-index$X git-mktag$X git-mktree$X git-patch-id$X \ git-peek-remote$X git-receive-pack$X \ diff --git a/fast-import.c b/fast-import.c new file mode 100644 index 0000000000..416ba5c7c6 --- /dev/null +++ b/fast-import.c @@ -0,0 +1,214 @@ +#include "builtin.h" +#include "cache.h" +#include "object.h" +#include "blob.h" +#include "delta.h" +#include "pack.h" +#include "csum-file.h" + +static int max_depth = 10; +static unsigned long object_count; +static int packfd; +static int current_depth; +static void *lastdat; +static unsigned long lastdatlen; +static unsigned char lastsha1[20]; + +static ssize_t yread(int fd, void *buffer, size_t length) +{ + ssize_t ret = 0; + while (ret < length) { + ssize_t size = xread(fd, (char *) buffer + ret, length - ret); + if (size < 0) { + return size; + } + if (size == 0) { + return ret; + } + ret += size; + } + return ret; +} + +static ssize_t ywrite(int fd, void *buffer, size_t length) +{ + ssize_t ret = 0; + while (ret < length) { + ssize_t size = xwrite(fd, (char *) 
buffer + ret, length - ret); + if (size < 0) { + return size; + } + if (size == 0) { + return ret; + } + ret += size; + } + return ret; +} + +static unsigned long encode_header(enum object_type type, unsigned long size, unsigned char *hdr) +{ + int n = 1; + unsigned char c; + + if (type < OBJ_COMMIT || type > OBJ_DELTA) + die("bad type %d", type); + + c = (type << 4) | (size & 15); + size >>= 4; + while (size) { + *hdr++ = c | 0x80; + c = size & 0x7f; + size >>= 7; + n++; + } + *hdr = c; + return n; +} + +static void write_blob (void *dat, unsigned long datlen) +{ + z_stream s; + void *out, *delta; + unsigned char hdr[64]; + unsigned long hdrlen, deltalen; + + if (lastdat && current_depth < max_depth) { + delta = diff_delta(lastdat, lastdatlen, + dat, datlen, + &deltalen, 0); + } else + delta = 0; + + memset(&s, 0, sizeof(s)); + deflateInit(&s, zlib_compression_level); + + if (delta) { + current_depth++; + s.next_in = delta; + s.avail_in = deltalen; + hdrlen = encode_header(OBJ_DELTA, deltalen, hdr); + if (ywrite(packfd, hdr, hdrlen) != hdrlen) + die("Can't write object header: %s", strerror(errno)); + if (ywrite(packfd, lastsha1, sizeof(lastsha1)) != sizeof(lastsha1)) + die("Can't write object base: %s", strerror(errno)); + } else { + current_depth = 0; + s.next_in = dat; + s.avail_in = datlen; + hdrlen = encode_header(OBJ_BLOB, datlen, hdr); + if (ywrite(packfd, hdr, hdrlen) != hdrlen) + die("Can't write object header: %s", strerror(errno)); + } + + s.avail_out = deflateBound(&s, s.avail_in); + s.next_out = out = xmalloc(s.avail_out); + while (deflate(&s, Z_FINISH) == Z_OK) + /* nothing */; + deflateEnd(&s); + + if (ywrite(packfd, out, s.total_out) != s.total_out) + die("Failed writing compressed data %s", strerror(errno)); + + free(out); + if (delta) + free(delta); +} + +static void init_pack_header () +{ + const char* magic = "PACK"; + unsigned long version = 2; + unsigned long zero = 0; + + version = htonl(version); + + if (ywrite(packfd, (char*)magic, 4) != 
4) + die("Can't write pack magic: %s", strerror(errno)); + if (ywrite(packfd, &version, 4) != 4) + die("Can't write pack version: %s", strerror(errno)); + if (ywrite(packfd, &zero, 4) != 4) + die("Can't write 0 object count: %s", strerror(errno)); +} + +static void fixup_header_footer () +{ + SHA_CTX c; + char hdr[8]; + unsigned char sha1[20]; + unsigned long cnt; + char *buf; + size_t n; + + if (lseek(packfd, 0, SEEK_SET) != 0) + die("Failed seeking to start: %s", strerror(errno)); + + SHA1_Init(&c); + if (yread(packfd, hdr, 8) != 8) + die("Failed reading header: %s", strerror(errno)); + SHA1_Update(&c, hdr, 8); + +fprintf(stderr, "%lu objects\n", object_count); + cnt = htonl(object_count); + SHA1_Update(&c, &cnt, 4); + if (ywrite(packfd, &cnt, 4) != 4) + die("Failed writing object count: %s", strerror(errno)); + + buf = xmalloc(128 * 1024); + for (;;) { + n = xread(packfd, buf, 128 * 1024); + if (n <= 0) + break; + SHA1_Update(&c, buf, n); + } + free(buf); + + SHA1_Final(sha1, &c); + if (ywrite(packfd, sha1, sizeof(sha1)) != sizeof(sha1)) + die("Failed writing pack checksum: %s", strerror(errno)); +} + +int main (int argc, const char **argv) +{ + packfd = open(argv[1], O_RDWR|O_CREAT|O_TRUNC, 0666); + if (packfd < 0) + die("Can't create pack file %s: %s", argv[1], strerror(errno)); + + init_pack_header(); + for (;;) { + unsigned long datlen; + int hdrlen; + void *dat; + char hdr[128]; + unsigned char sha1[20]; + SHA_CTX c; + + if (yread(0, &datlen, 4) != 4) + break; + + dat = xmalloc(datlen); + if (yread(0, dat, datlen) != datlen) + break; + + hdrlen = sprintf(hdr, "blob %lu", datlen) + 1; + SHA1_Init(&c); + SHA1_Update(&c, hdr, hdrlen); + SHA1_Update(&c, dat, datlen); + SHA1_Final(sha1, &c); + + write_blob(dat, datlen); + object_count++; + printf("%s\n", sha1_to_hex(sha1)); + fflush(stdout); + + if (lastdat) + free(lastdat); + lastdat = dat; + lastdatlen = datlen; + memcpy(lastsha1, sha1, sizeof(sha1)); + } + fixup_header_footer(); + close(packfd); + + return 0; 
+} From 8bcce30126b90af83c1291e072f74950e73a2584 Mon Sep 17 00:00:00 2001 From: "Shawn O. Pearce" Date: Sun, 6 Aug 2006 13:51:39 -0400 Subject: [PATCH 02/81] Added automatic index generation to fast-import. Signed-off-by: Shawn O. Pearce --- fast-import.c | 182 ++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 163 insertions(+), 19 deletions(-) diff --git a/fast-import.c b/fast-import.c index 416ba5c7c6..0d95118499 100644 --- a/fast-import.c +++ b/fast-import.c @@ -8,11 +8,75 @@ static int max_depth = 10; static unsigned long object_count; +static unsigned long duplicate_count; +static unsigned long packoff; +static unsigned long overflow_count; static int packfd; static int current_depth; static void *lastdat; static unsigned long lastdatlen; static unsigned char lastsha1[20]; +static unsigned char packsha1[20]; + +struct object_entry +{ + struct object_entry *next; + unsigned long offset; + unsigned char sha1[20]; +}; + +struct overflow_object_entry +{ + struct overflow_object_entry *next; + struct object_entry oe; +}; + +struct object_entry *pool_start; +struct object_entry *pool_next; +struct object_entry *pool_end; +struct overflow_object_entry *overflow; +struct object_entry *table[1 << 16]; + +static struct object_entry* new_object(unsigned char *sha1) +{ + if (pool_next != pool_end) { + struct object_entry *e = pool_next++; + memcpy(e->sha1, sha1, sizeof(e->sha1)); + return e; + } else { + struct overflow_object_entry *e; + + e = xmalloc(sizeof(struct overflow_object_entry)); + e->next = overflow; + memcpy(e->oe.sha1, sha1, sizeof(e->oe.sha1)); + overflow = e; + overflow_count++; + return &e->oe; + } +} + +static struct object_entry* insert_object(unsigned char *sha1) +{ + unsigned int h = sha1[0] << 8 | sha1[1]; + struct object_entry *e = table[h]; + struct object_entry *p = 0; + + while (e) { + if (!memcmp(sha1, e->sha1, sizeof(e->sha1))) + return e; + p = e; + e = e->next; + } + + e = new_object(sha1); + e->next = 0; + e->offset = 0; + 
if (p) + p->next = e; + else + table[h] = e; + return e; +} static ssize_t yread(int fd, void *buffer, size_t length) { @@ -66,7 +130,7 @@ static unsigned long encode_header(enum object_type type, unsigned long size, un return n; } -static void write_blob (void *dat, unsigned long datlen) +static void write_blob(void *dat, unsigned long datlen) { z_stream s; void *out, *delta; @@ -92,6 +156,7 @@ static void write_blob (void *dat, unsigned long datlen) die("Can't write object header: %s", strerror(errno)); if (ywrite(packfd, lastsha1, sizeof(lastsha1)) != sizeof(lastsha1)) die("Can't write object base: %s", strerror(errno)); + packoff += hdrlen + sizeof(lastsha1); } else { current_depth = 0; s.next_in = dat; @@ -99,6 +164,7 @@ static void write_blob (void *dat, unsigned long datlen) hdrlen = encode_header(OBJ_BLOB, datlen, hdr); if (ywrite(packfd, hdr, hdrlen) != hdrlen) die("Can't write object header: %s", strerror(errno)); + packoff += hdrlen; } s.avail_out = deflateBound(&s, s.avail_in); @@ -109,13 +175,14 @@ static void write_blob (void *dat, unsigned long datlen) if (ywrite(packfd, out, s.total_out) != s.total_out) die("Failed writing compressed data %s", strerror(errno)); + packoff += s.total_out; free(out); if (delta) free(delta); } -static void init_pack_header () +static void init_pack_header() { const char* magic = "PACK"; unsigned long version = 2; @@ -129,13 +196,13 @@ static void init_pack_header () die("Can't write pack version: %s", strerror(errno)); if (ywrite(packfd, &zero, 4) != 4) die("Can't write 0 object count: %s", strerror(errno)); + packoff = 4 * 3; } -static void fixup_header_footer () +static void fixup_header_footer() { SHA_CTX c; char hdr[8]; - unsigned char sha1[20]; unsigned long cnt; char *buf; size_t n; @@ -148,7 +215,6 @@ static void fixup_header_footer () die("Failed reading header: %s", strerror(errno)); SHA1_Update(&c, hdr, 8); -fprintf(stderr, "%lu objects\n", object_count); cnt = htonl(object_count); SHA1_Update(&c, &cnt, 4); if 
(ywrite(packfd, &cnt, 4) != 4) @@ -163,16 +229,81 @@ fprintf(stderr, "%lu objects\n", object_count); } free(buf); - SHA1_Final(sha1, &c); - if (ywrite(packfd, sha1, sizeof(sha1)) != sizeof(sha1)) + SHA1_Final(packsha1, &c); + if (ywrite(packfd, packsha1, sizeof(packsha1)) != sizeof(packsha1)) die("Failed writing pack checksum: %s", strerror(errno)); } -int main (int argc, const char **argv) +static int oecmp (const void *_a, const void *_b) { - packfd = open(argv[1], O_RDWR|O_CREAT|O_TRUNC, 0666); + struct object_entry *a = *((struct object_entry**)_a); + struct object_entry *b = *((struct object_entry**)_b); + return memcmp(a->sha1, b->sha1, sizeof(a->sha1)); +} + +static void write_index(const char *idx_name) +{ + struct sha1file *f; + struct object_entry **idx, **c, **last; + struct object_entry *e; + struct overflow_object_entry *o; + unsigned int array[256]; + int i; + + /* Build the sorted table of object IDs. */ + idx = xmalloc(object_count * sizeof(struct object_entry*)); + c = idx; + for (e = pool_start; e != pool_next; e++) + *c++ = e; + for (o = overflow; o; o = o->next) + *c++ = &o->oe; + last = idx + object_count; + qsort(idx, object_count, sizeof(struct object_entry*), oecmp); + + /* Generate the fan-out array. 
*/ + c = idx; + for (i = 0; i < 256; i++) { + struct object_entry **next = c;; + while (next < last) { + if ((*next)->sha1[0] != i) + break; + next++; + } + array[i] = htonl(next - idx); + c = next; + } + + f = sha1create("%s", idx_name); + sha1write(f, array, 256 * sizeof(int)); + for (c = idx; c != last; c++) { + unsigned int offset = htonl((*c)->offset); + sha1write(f, &offset, 4); + sha1write(f, (*c)->sha1, sizeof((*c)->sha1)); + } + sha1write(f, packsha1, sizeof(packsha1)); + sha1close(f, NULL, 1); + free(idx); +} + +int main(int argc, const char **argv) +{ + const char *base_name = argv[1]; + int est_obj_cnt = atoi(argv[2]); + char *pack_name; + char *idx_name; + + pack_name = xmalloc(strlen(base_name) + 6); + sprintf(pack_name, "%s.pack", base_name); + idx_name = xmalloc(strlen(base_name) + 5); + sprintf(idx_name, "%s.idx", base_name); + + packfd = open(pack_name, O_RDWR|O_CREAT|O_TRUNC, 0666); if (packfd < 0) - die("Can't create pack file %s: %s", argv[1], strerror(errno)); + die("Can't create pack file %s: %s", pack_name, strerror(errno)); + + pool_start = xmalloc(est_obj_cnt * sizeof(struct object_entry)); + pool_next = pool_start; + pool_end = pool_start + est_obj_cnt; init_pack_header(); for (;;) { @@ -182,8 +313,10 @@ int main (int argc, const char **argv) char hdr[128]; unsigned char sha1[20]; SHA_CTX c; + struct object_entry *e; if (yread(0, &datlen, 4) != 4) + break; dat = xmalloc(datlen); @@ -196,19 +329,30 @@ int main (int argc, const char **argv) SHA1_Update(&c, dat, datlen); SHA1_Final(sha1, &c); - write_blob(dat, datlen); - object_count++; - printf("%s\n", sha1_to_hex(sha1)); - fflush(stdout); + e = insert_object(sha1); + if (!e->offset) { + e->offset = packoff; + write_blob(dat, datlen); + object_count++; + printf("%s\n", sha1_to_hex(sha1)); + fflush(stdout); - if (lastdat) - free(lastdat); - lastdat = dat; - lastdatlen = datlen; - memcpy(lastsha1, sha1, sizeof(sha1)); + if (lastdat) + free(lastdat); + lastdat = dat; + lastdatlen = datlen; + 
memcpy(lastsha1, sha1, sizeof(sha1)); + } else { + duplicate_count++; + free(dat); + } } fixup_header_footer(); close(packfd); + write_index(idx_name); + + fprintf(stderr, "%lu objects, %lu duplicates, %lu pool overflow\n", + object_count, duplicate_count, overflow_count); return 0; } From 27d6d29035473f01ba5bb3b52c86ee4181d251fe Mon Sep 17 00:00:00 2001 From: "Shawn O. Pearce" Date: Tue, 8 Aug 2006 00:03:59 -0400 Subject: [PATCH 03/81] Cleaned up memory allocation for object_entry structs. Although its easy to ask the user to tell us how many objects they will need, its probably better to dynamically grow the object table in large units. But if the user can give us a hint as to roughly how many objects then we can still use it during startup. Also stopped printing the SHA1 strings to stdout as no user is currently making use of that facility. Signed-off-by: Shawn O. Pearce --- fast-import.c | 97 +++++++++++++++++++++++++-------------------------- 1 file changed, 48 insertions(+), 49 deletions(-) diff --git a/fast-import.c b/fast-import.c index 0d95118499..3856c87c4e 100644 --- a/fast-import.c +++ b/fast-import.c @@ -6,18 +6,6 @@ #include "pack.h" #include "csum-file.h" -static int max_depth = 10; -static unsigned long object_count; -static unsigned long duplicate_count; -static unsigned long packoff; -static unsigned long overflow_count; -static int packfd; -static int current_depth; -static void *lastdat; -static unsigned long lastdatlen; -static unsigned char lastsha1[20]; -static unsigned char packsha1[20]; - struct object_entry { struct object_entry *next; @@ -25,40 +13,57 @@ struct object_entry unsigned char sha1[20]; }; -struct overflow_object_entry +struct object_entry_block { - struct overflow_object_entry *next; - struct object_entry oe; + struct object_entry_block *next_block; + struct object_entry *next_free; + struct object_entry *end; + struct object_entry entries[0]; }; -struct object_entry *pool_start; -struct object_entry *pool_next; -struct 
object_entry *pool_end; -struct overflow_object_entry *overflow; -struct object_entry *table[1 << 16]; +static int max_depth = 10; +static unsigned long alloc_count; +static unsigned long object_count; +static unsigned long duplicate_count; +static unsigned long packoff; +static int packfd; +static int current_depth; +static void *lastdat; +static unsigned long lastdatlen; +static unsigned char lastsha1[20]; +static unsigned char packsha1[20]; +struct object_entry *object_table[1 << 16]; +struct object_entry_block *blocks; + +static void alloc_objects(int cnt) +{ + struct object_entry_block *b; + + b = xmalloc(sizeof(struct object_entry_block) + + cnt * sizeof(struct object_entry)); + b->next_block = blocks; + b->next_free = b->entries; + b->end = b->entries + cnt; + blocks = b; + alloc_count += cnt; +} static struct object_entry* new_object(unsigned char *sha1) { - if (pool_next != pool_end) { - struct object_entry *e = pool_next++; - memcpy(e->sha1, sha1, sizeof(e->sha1)); - return e; - } else { - struct overflow_object_entry *e; + struct object_entry *e; - e = xmalloc(sizeof(struct overflow_object_entry)); - e->next = overflow; - memcpy(e->oe.sha1, sha1, sizeof(e->oe.sha1)); - overflow = e; - overflow_count++; - return &e->oe; - } + if (blocks->next_free == blocks->end) + alloc_objects(1000); + + e = blocks->next_free++; + memcpy(e->sha1, sha1, sizeof(e->sha1)); + return e; } static struct object_entry* insert_object(unsigned char *sha1) { unsigned int h = sha1[0] << 8 | sha1[1]; - struct object_entry *e = table[h]; + struct object_entry *e = object_table[h]; struct object_entry *p = 0; while (e) { @@ -74,7 +79,7 @@ static struct object_entry* insert_object(unsigned char *sha1) if (p) p->next = e; else - table[h] = e; + object_table[h] = e; return e; } @@ -246,17 +251,16 @@ static void write_index(const char *idx_name) struct sha1file *f; struct object_entry **idx, **c, **last; struct object_entry *e; - struct overflow_object_entry *o; + struct 
object_entry_block *o; unsigned int array[256]; int i; /* Build the sorted table of object IDs. */ idx = xmalloc(object_count * sizeof(struct object_entry*)); c = idx; - for (e = pool_start; e != pool_next; e++) - *c++ = e; - for (o = overflow; o; o = o->next) - *c++ = &o->oe; + for (o = blocks; o; o = o->next_block) + for (e = o->entries; e != o->next_free; e++) + *c++ = e; last = idx + object_count; qsort(idx, object_count, sizeof(struct object_entry*), oecmp); @@ -297,14 +301,11 @@ int main(int argc, const char **argv) idx_name = xmalloc(strlen(base_name) + 5); sprintf(idx_name, "%s.idx", base_name); - packfd = open(pack_name, O_RDWR|O_CREAT|O_TRUNC, 0666); + packfd = open(pack_name, O_RDWR|O_CREAT|O_EXCL, 0666); if (packfd < 0) die("Can't create pack file %s: %s", pack_name, strerror(errno)); - pool_start = xmalloc(est_obj_cnt * sizeof(struct object_entry)); - pool_next = pool_start; - pool_end = pool_start + est_obj_cnt; - + alloc_objects(est_obj_cnt); init_pack_header(); for (;;) { unsigned long datlen; @@ -334,8 +335,6 @@ int main(int argc, const char **argv) e->offset = packoff; write_blob(dat, datlen); object_count++; - printf("%s\n", sha1_to_hex(sha1)); - fflush(stdout); if (lastdat) free(lastdat); @@ -351,8 +350,8 @@ int main(int argc, const char **argv) close(packfd); write_index(idx_name); - fprintf(stderr, "%lu objects, %lu duplicates, %lu pool overflow\n", - object_count, duplicate_count, overflow_count); + fprintf(stderr, "%lu objects, %lu duplicates, %lu allocated (%lu overflow)\n", + object_count, duplicate_count, alloc_count, alloc_count - est_obj_cnt); return 0; } From ac47a738a7c866eeffc0c6374c0ef3f7ca6ee79d Mon Sep 17 00:00:00 2001 From: "Shawn O. Pearce" Date: Tue, 8 Aug 2006 00:46:13 -0400 Subject: [PATCH 04/81] Refactored fast-import's internals for future additions. 
Too many globals variables were being used not not enough code was resuable to process trees and commits so this is a simple refactoring of the existing blob processing code to get into a state that will be easier to handle trees and commits in. Signed-off-by: Shawn O. Pearce --- fast-import.c | 149 ++++++++++++++++++++++++++++---------------------- 1 file changed, 83 insertions(+), 66 deletions(-) diff --git a/fast-import.c b/fast-import.c index 3856c87c4e..8b4be28f60 100644 --- a/fast-import.c +++ b/fast-import.c @@ -18,22 +18,34 @@ struct object_entry_block struct object_entry_block *next_block; struct object_entry *next_free; struct object_entry *end; - struct object_entry entries[0]; + struct object_entry entries[FLEX_ARRAY]; /* more */ }; +struct last_object +{ + void *data; + unsigned long len; + int depth; + unsigned char sha1[20]; +}; + +/* Stats and misc. counters. */ static int max_depth = 10; static unsigned long alloc_count; static unsigned long object_count; static unsigned long duplicate_count; -static unsigned long packoff; -static int packfd; -static int current_depth; -static void *lastdat; -static unsigned long lastdatlen; -static unsigned char lastsha1[20]; -static unsigned char packsha1[20]; -struct object_entry *object_table[1 << 16]; + +/* The .pack file */ +static int pack_fd; +static unsigned long pack_offset; +static unsigned char pack_sha1[20]; + +/* Table of objects we've written. 
*/ struct object_entry_block *blocks; +struct object_entry *object_table[1 << 16]; + +/* Our last blob */ +struct last_object last_blob; static void alloc_objects(int cnt) { @@ -115,7 +127,10 @@ static ssize_t ywrite(int fd, void *buffer, size_t length) return ret; } -static unsigned long encode_header(enum object_type type, unsigned long size, unsigned char *hdr) +static unsigned long encode_header( + enum object_type type, + unsigned long size, + unsigned char *hdr) { int n = 1; unsigned char c; @@ -135,41 +150,62 @@ static unsigned long encode_header(enum object_type type, unsigned long size, un return n; } -static void write_blob(void *dat, unsigned long datlen) +static int store_object( + enum object_type type, + void *dat, + unsigned long datlen, + struct last_object *last) { - z_stream s; void *out, *delta; - unsigned char hdr[64]; + struct object_entry *e; + unsigned char hdr[96]; + unsigned char sha1[20]; unsigned long hdrlen, deltalen; + SHA_CTX c; + z_stream s; - if (lastdat && current_depth < max_depth) { - delta = diff_delta(lastdat, lastdatlen, + hdrlen = sprintf((char*)hdr,"%s %lu",type_names[type],datlen) + 1; + SHA1_Init(&c); + SHA1_Update(&c, hdr, hdrlen); + SHA1_Update(&c, dat, datlen); + SHA1_Final(sha1, &c); + + e = insert_object(sha1); + if (e->offset) { + duplicate_count++; + return 0; + } + e->offset = pack_offset; + object_count++; + + if (last->data && last->depth < max_depth) + delta = diff_delta(last->data, last->len, dat, datlen, &deltalen, 0); - } else + else delta = 0; memset(&s, 0, sizeof(s)); deflateInit(&s, zlib_compression_level); if (delta) { - current_depth++; + last->depth++; s.next_in = delta; s.avail_in = deltalen; hdrlen = encode_header(OBJ_DELTA, deltalen, hdr); - if (ywrite(packfd, hdr, hdrlen) != hdrlen) + if (ywrite(pack_fd, hdr, hdrlen) != hdrlen) die("Can't write object header: %s", strerror(errno)); - if (ywrite(packfd, lastsha1, sizeof(lastsha1)) != sizeof(lastsha1)) + if (ywrite(pack_fd, last->sha1, sizeof(sha1)) != 
sizeof(sha1)) die("Can't write object base: %s", strerror(errno)); - packoff += hdrlen + sizeof(lastsha1); + pack_offset += hdrlen + sizeof(sha1); } else { - current_depth = 0; + last->depth = 0; s.next_in = dat; s.avail_in = datlen; - hdrlen = encode_header(OBJ_BLOB, datlen, hdr); - if (ywrite(packfd, hdr, hdrlen) != hdrlen) + hdrlen = encode_header(type, datlen, hdr); + if (ywrite(pack_fd, hdr, hdrlen) != hdrlen) die("Can't write object header: %s", strerror(errno)); - packoff += hdrlen; + pack_offset += hdrlen; } s.avail_out = deflateBound(&s, s.avail_in); @@ -178,13 +214,19 @@ static void write_blob(void *dat, unsigned long datlen) /* nothing */; deflateEnd(&s); - if (ywrite(packfd, out, s.total_out) != s.total_out) + if (ywrite(pack_fd, out, s.total_out) != s.total_out) die("Failed writing compressed data %s", strerror(errno)); - packoff += s.total_out; + pack_offset += s.total_out; free(out); if (delta) free(delta); + if (last->data) + free(last->data); + last->data = dat; + last->len = datlen; + memcpy(last->sha1, sha1, sizeof(sha1)); + return 1; } static void init_pack_header() @@ -195,13 +237,13 @@ static void init_pack_header() version = htonl(version); - if (ywrite(packfd, (char*)magic, 4) != 4) + if (ywrite(pack_fd, (char*)magic, 4) != 4) die("Can't write pack magic: %s", strerror(errno)); - if (ywrite(packfd, &version, 4) != 4) + if (ywrite(pack_fd, &version, 4) != 4) die("Can't write pack version: %s", strerror(errno)); - if (ywrite(packfd, &zero, 4) != 4) + if (ywrite(pack_fd, &zero, 4) != 4) die("Can't write 0 object count: %s", strerror(errno)); - packoff = 4 * 3; + pack_offset = 4 * 3; } static void fixup_header_footer() @@ -212,30 +254,30 @@ static void fixup_header_footer() char *buf; size_t n; - if (lseek(packfd, 0, SEEK_SET) != 0) + if (lseek(pack_fd, 0, SEEK_SET) != 0) die("Failed seeking to start: %s", strerror(errno)); SHA1_Init(&c); - if (yread(packfd, hdr, 8) != 8) + if (yread(pack_fd, hdr, 8) != 8) die("Failed reading header: %s", 
strerror(errno)); SHA1_Update(&c, hdr, 8); cnt = htonl(object_count); SHA1_Update(&c, &cnt, 4); - if (ywrite(packfd, &cnt, 4) != 4) + if (ywrite(pack_fd, &cnt, 4) != 4) die("Failed writing object count: %s", strerror(errno)); buf = xmalloc(128 * 1024); for (;;) { - n = xread(packfd, buf, 128 * 1024); + n = xread(pack_fd, buf, 128 * 1024); if (n <= 0) break; SHA1_Update(&c, buf, n); } free(buf); - SHA1_Final(packsha1, &c); - if (ywrite(packfd, packsha1, sizeof(packsha1)) != sizeof(packsha1)) + SHA1_Final(pack_sha1, &c); + if (ywrite(pack_fd, pack_sha1, sizeof(pack_sha1)) != sizeof(pack_sha1)) die("Failed writing pack checksum: %s", strerror(errno)); } @@ -284,7 +326,7 @@ static void write_index(const char *idx_name) sha1write(f, &offset, 4); sha1write(f, (*c)->sha1, sizeof((*c)->sha1)); } - sha1write(f, packsha1, sizeof(packsha1)); + sha1write(f, pack_sha1, sizeof(pack_sha1)); sha1close(f, NULL, 1); free(idx); } @@ -301,53 +343,28 @@ int main(int argc, const char **argv) idx_name = xmalloc(strlen(base_name) + 5); sprintf(idx_name, "%s.idx", base_name); - packfd = open(pack_name, O_RDWR|O_CREAT|O_EXCL, 0666); - if (packfd < 0) + pack_fd = open(pack_name, O_RDWR|O_CREAT|O_EXCL, 0666); + if (pack_fd < 0) die("Can't create pack file %s: %s", pack_name, strerror(errno)); alloc_objects(est_obj_cnt); init_pack_header(); for (;;) { unsigned long datlen; - int hdrlen; void *dat; - char hdr[128]; - unsigned char sha1[20]; - SHA_CTX c; - struct object_entry *e; if (yread(0, &datlen, 4) != 4) - break; dat = xmalloc(datlen); if (yread(0, dat, datlen) != datlen) break; - hdrlen = sprintf(hdr, "blob %lu", datlen) + 1; - SHA1_Init(&c); - SHA1_Update(&c, hdr, hdrlen); - SHA1_Update(&c, dat, datlen); - SHA1_Final(sha1, &c); - - e = insert_object(sha1); - if (!e->offset) { - e->offset = packoff; - write_blob(dat, datlen); - object_count++; - - if (lastdat) - free(lastdat); - lastdat = dat; - lastdatlen = datlen; - memcpy(lastsha1, sha1, sizeof(sha1)); - } else { - duplicate_count++; + 
if (!store_object(OBJ_BLOB, dat, datlen, &last_blob)) free(dat); - } } fixup_header_footer(); - close(packfd); + close(pack_fd); write_index(idx_name); fprintf(stderr, "%lu objects, %lu duplicates, %lu allocated (%lu overflow)\n", From 6143f0644e79686407c1dc0e1b4dadff74e80046 Mon Sep 17 00:00:00 2001 From: "Shawn O. Pearce" Date: Tue, 8 Aug 2006 01:14:21 -0400 Subject: [PATCH 05/81] Added basic command handler to fast-import. Moved the new_blob logic off into a new subroutine and invoked it when getting the 'blob' command. Added statistics dump to STDERR when the program terminates listing what it did at a high level. This is somewhat interesting. Signed-off-by: Shawn O. Pearce --- fast-import.c | 60 +++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 46 insertions(+), 14 deletions(-) diff --git a/fast-import.c b/fast-import.c index 8b4be28f60..c9c48c5869 100644 --- a/fast-import.c +++ b/fast-import.c @@ -34,6 +34,8 @@ static int max_depth = 10; static unsigned long alloc_count; static unsigned long object_count; static unsigned long duplicate_count; +static unsigned long object_count_by_type[9]; +static unsigned long duplicate_count_by_type[9]; /* The .pack file */ static int pack_fd; @@ -173,10 +175,12 @@ static int store_object( e = insert_object(sha1); if (e->offset) { duplicate_count++; + duplicate_count_by_type[type]++; return 0; } e->offset = pack_offset; object_count++; + object_count_by_type[type]++; if (last->data && last->depth < max_depth) delta = diff_delta(last->data, last->len, @@ -232,7 +236,7 @@ static int store_object( static void init_pack_header() { const char* magic = "PACK"; - unsigned long version = 2; + unsigned long version = 3; unsigned long zero = 0; version = htonl(version); @@ -331,12 +335,29 @@ static void write_index(const char *idx_name) free(idx); } +static void new_blob() +{ + unsigned long datlen; + void *dat; + + if (yread(0, &datlen, 4) != 4) + die("Can't obtain blob length"); + + dat = xmalloc(datlen); + if 
(yread(0, dat, datlen) != datlen) + die("Con't obtain %lu bytes of blob data", datlen); + + if (!store_object(OBJ_BLOB, dat, datlen, &last_blob)) + free(dat); +} + int main(int argc, const char **argv) { const char *base_name = argv[1]; int est_obj_cnt = atoi(argv[2]); char *pack_name; char *idx_name; + struct stat sb; pack_name = xmalloc(strlen(base_name) + 6); sprintf(pack_name, "%s.pack", base_name); @@ -345,30 +366,41 @@ int main(int argc, const char **argv) pack_fd = open(pack_name, O_RDWR|O_CREAT|O_EXCL, 0666); if (pack_fd < 0) - die("Can't create pack file %s: %s", pack_name, strerror(errno)); + die("Can't create %s: %s", pack_name, strerror(errno)); alloc_objects(est_obj_cnt); init_pack_header(); for (;;) { - unsigned long datlen; - void *dat; - - if (yread(0, &datlen, 4) != 4) + unsigned long cmd; + if (yread(0, &cmd, 4) != 4) break; - dat = xmalloc(datlen); - if (yread(0, dat, datlen) != datlen) - break; - - if (!store_object(OBJ_BLOB, dat, datlen, &last_blob)) - free(dat); + switch (cmd) { + case 'blob': new_blob(); break; + default: + die("Invalid command %lu", cmd); + } } fixup_header_footer(); close(pack_fd); write_index(idx_name); - fprintf(stderr, "%lu objects, %lu duplicates, %lu allocated (%lu overflow)\n", - object_count, duplicate_count, alloc_count, alloc_count - est_obj_cnt); + fprintf(stderr, "%s statistics:\n", argv[0]); + fprintf(stderr, "---------------------------------------------------\n"); + fprintf(stderr, "Alloc'd objects: %10lu (%10lu overflow )\n", alloc_count, alloc_count - est_obj_cnt); + fprintf(stderr, "Total objects: %10lu (%10lu duplicates)\n", object_count, duplicate_count); + fprintf(stderr, " blobs : %10lu (%10lu duplicates)\n", object_count_by_type[OBJ_BLOB], duplicate_count_by_type[OBJ_BLOB]); + fprintf(stderr, " trees : %10lu (%10lu duplicates)\n", object_count_by_type[OBJ_TREE], duplicate_count_by_type[OBJ_TREE]); + fprintf(stderr, " commits: %10lu (%10lu duplicates)\n", object_count_by_type[OBJ_COMMIT], 
duplicate_count_by_type[OBJ_COMMIT]); + fprintf(stderr, " tags : %10lu (%10lu duplicates)\n", object_count_by_type[OBJ_TAG], duplicate_count_by_type[OBJ_TAG]); + fprintf(stderr, "---------------------------------------------------\n"); + + stat(pack_name, &sb); + fprintf(stderr, "Pack size: %10lu KiB\n", (unsigned long)(sb.st_size/1024)); + stat(idx_name, &sb); + fprintf(stderr, "Index size: %10lu KiB\n", (unsigned long)(sb.st_size/1024)); + + fprintf(stderr, "\n"); return 0; } From 6bb5b3291df99bf050c91ab742b406d2404b8f73 Mon Sep 17 00:00:00 2001 From: "Shawn O. Pearce" Date: Tue, 8 Aug 2006 03:36:45 -0400 Subject: [PATCH 06/81] Implemented branch handling and basic tree support in fast-import. This provides the basic data structures needed to store trees in memory while we are processing them for a branch. What we are attempting to do is track one complete tree for each branch that the frontend has registered with us through the 'newb' (new_branch) command. When the frontend edits that tree through 'updf' or 'delf' commands we'll mark the affected tree(s) as being dirty and recompute their objects during 'comt' (commit). Currently the protocol is decidedly _not_ user friendly. I crashed fast-import by giving it bad input data from Perl. I may try to improve upon it, or at least upon its error handling. Signed-off-by: Shawn O. 
Pearce --- fast-import.c | 170 ++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 165 insertions(+), 5 deletions(-) diff --git a/fast-import.c b/fast-import.c index c9c48c5869..98c5d1cbdd 100644 --- a/fast-import.c +++ b/fast-import.c @@ -24,14 +24,39 @@ struct object_entry_block struct last_object { void *data; - unsigned long len; - int depth; + unsigned int len; + unsigned int depth; unsigned char sha1[20]; }; +struct tree; +struct tree_entry +{ + struct tree *tree; + mode_t mode; + unsigned char sha1[20]; + char name[FLEX_ARRAY]; /* more */ +}; + +struct tree +{ + struct last_object last_tree; + unsigned long entry_count; + struct tree_entry **entries; +}; + +struct branch +{ + struct branch *next_branch; + struct tree_entry tree; + unsigned char sha1[20]; + const char *name; +}; + /* Stats and misc. counters. */ static int max_depth = 10; static unsigned long alloc_count; +static unsigned long branch_count; static unsigned long object_count; static unsigned long duplicate_count; static unsigned long object_count_by_type[9]; @@ -49,6 +74,10 @@ struct object_entry *object_table[1 << 16]; /* Our last blob */ struct last_object last_blob; +/* Branch data */ +struct branch *branches; +struct branch *current_branch; + static void alloc_objects(int cnt) { struct object_entry_block *b; @@ -129,6 +158,32 @@ static ssize_t ywrite(int fd, void *buffer, size_t length) return ret; } +static const char* read_string() +{ + static char sn[PATH_MAX]; + unsigned long slen; + + if (yread(0, &slen, 4) != 4) + die("Can't obtain string"); + if (!slen) + return 0; + if (slen > (PATH_MAX - 1)) + die("Can't handle excessive string length %lu", slen); + + if (yread(0, sn, slen) != slen) + die("Can't obtain string of length %lu", slen); + sn[slen] = 0; + return sn; +} + +static const char* read_required_string() +{ + const char *r = read_string(); + if (!r) + die("Expected string command parameter, didn't find one"); + return r; +} + static unsigned long encode_header( 
enum object_type type, unsigned long size, @@ -156,7 +211,8 @@ static int store_object( enum object_type type, void *dat, unsigned long datlen, - struct last_object *last) + struct last_object *last, + unsigned char *sha1out) { void *out, *delta; struct object_entry *e; @@ -171,6 +227,8 @@ static int store_object( SHA1_Update(&c, hdr, hdrlen); SHA1_Update(&c, dat, datlen); SHA1_Final(sha1, &c); + if (sha1out) + memcpy(sha1out, sha1, sizeof(sha1)); e = insert_object(sha1); if (e->offset) { @@ -347,10 +405,108 @@ static void new_blob() if (yread(0, dat, datlen) != datlen) die("Con't obtain %lu bytes of blob data", datlen); - if (!store_object(OBJ_BLOB, dat, datlen, &last_blob)) + if (!store_object(OBJ_BLOB, dat, datlen, &last_blob, 0)) free(dat); } +static struct branch* lookup_branch(const char *name) +{ + struct branch *b; + for (b = branches; b; b = b->next_branch) { + if (!strcmp(name, b->name)) + return b; + } + die("No branch named '%s' has been declared", name); +} + +static struct tree* deep_copy_tree (struct tree *t) +{ + struct tree *r = xmalloc(sizeof(struct tree)); + unsigned long i; + + if (t->last_tree.data) { + r->last_tree.data = xmalloc(t->last_tree.len); + r->last_tree.len = t->last_tree.len; + r->last_tree.depth = t->last_tree.depth; + memcpy(r->last_tree.data, t->last_tree.data, t->last_tree.len); + memcpy(r->last_tree.sha1, t->last_tree.sha1, sizeof(t->last_tree.sha1)); + } + + r->entry_count = t->entry_count; + r->entries = xmalloc(t->entry_count * sizeof(struct tree_entry*)); + for (i = 0; i < t->entry_count; i++) { + struct tree_entry *a = t->entries[i]; + struct tree_entry *b; + + b = xmalloc(sizeof(struct tree_entry) + strlen(a->name) + 1); + b->tree = a->tree ? 
deep_copy_tree(a->tree) : 0; + b->mode = a->mode; + memcpy(b->sha1, a->sha1, sizeof(a->sha1)); + strcpy(b->name, a->name); + r->entries[i] = b; + } + + return r; +} + +static void store_tree (struct tree_entry *e) +{ + struct tree *t = e->tree; + unsigned long maxlen, i; + char *buf, *c; + + if (memcmp(null_sha1, e->sha1, sizeof(e->sha1))) + return; + + maxlen = t->entry_count * 32; + for (i = 0; i < t->entry_count; i++) + maxlen += strlen(t->entries[i]->name); + + buf = c = xmalloc(maxlen); + for (i = 0; i < t->entry_count; i++) { + struct tree_entry *e = t->entries[i]; + c += sprintf(c, "%o %s", e->mode, e->name) + 1; + if (e->tree) + store_tree(e); + memcpy(c, e->sha1, sizeof(e->sha1)); + c += sizeof(e->sha1); + } + + if (!store_object(OBJ_TREE, buf, c - buf, &t->last_tree, e->sha1)) + free(buf); +} + +static void new_branch() +{ + struct branch *nb = xcalloc(1, sizeof(struct branch)); + const char *source_name; + + nb->name = strdup(read_required_string()); + source_name = read_string(); + if (source_name) { + struct branch *sb = lookup_branch(source_name); + nb->tree.tree = deep_copy_tree(sb->tree.tree); + memcpy(nb->tree.sha1, sb->tree.sha1, sizeof(sb->tree.sha1)); + memcpy(nb->sha1, sb->sha1, sizeof(sb->sha1)); + } else { + nb->tree.tree = xcalloc(1, sizeof(struct tree)); + nb->tree.tree->entries = xmalloc(8*sizeof(struct tree_entry*)); + } + nb->next_branch = branches; + branches = nb; + branch_count++; +} + +static void set_branch() +{ + current_branch = lookup_branch(read_required_string()); +} + +static void commit() +{ + store_tree(¤t_branch->tree); +} + int main(int argc, const char **argv) { const char *base_name = argv[1]; @@ -376,7 +532,10 @@ int main(int argc, const char **argv) break; switch (cmd) { - case 'blob': new_blob(); break; + case 'blob': new_blob(); break; + case 'newb': new_branch(); break; + case 'setb': set_branch(); break; + case 'comt': commit(); break; default: die("Invalid command %lu", cmd); } @@ -393,6 +552,7 @@ int main(int 
argc, const char **argv) fprintf(stderr, " trees : %10lu (%10lu duplicates)\n", object_count_by_type[OBJ_TREE], duplicate_count_by_type[OBJ_TREE]); fprintf(stderr, " commits: %10lu (%10lu duplicates)\n", object_count_by_type[OBJ_COMMIT], duplicate_count_by_type[OBJ_COMMIT]); fprintf(stderr, " tags : %10lu (%10lu duplicates)\n", object_count_by_type[OBJ_TAG], duplicate_count_by_type[OBJ_TAG]); + fprintf(stderr, "Total branches: %10lu\n", branch_count); fprintf(stderr, "---------------------------------------------------\n"); stat(pack_name, &sb); From 463acbe1c60fc5009dac9d033df6c2b9c5037a91 Mon Sep 17 00:00:00 2001 From: "Shawn O. Pearce" Date: Mon, 14 Aug 2006 00:58:19 -0400 Subject: [PATCH 07/81] Added tree and commit writing to fast-import. The tree of the current commit can be altered by file_change commands before the commit gets written to the pack. The file changes are rather primitive as they simply allow removal of a tree entry or setting/adding a tree entry. Currently trees and commits aren't being deltafied when written to the pack and branch reloading from the current pack doesn't work, so at most 5 branches can be worked with at any one time. Signed-off-by: Shawn O. 
Pearce --- fast-import.c | 914 ++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 733 insertions(+), 181 deletions(-) diff --git a/fast-import.c b/fast-import.c index 98c5d1cbdd..4605b7469b 100644 --- a/fast-import.c +++ b/fast-import.c @@ -1,9 +1,70 @@ +/* +Format of STDIN stream: + + stream ::= cmd*; + + cmd ::= new_blob + | new_commit + | new_branch + | new_tag + ; + + new_blob ::= 'blob' blob_data; + + new_commit ::= 'comt' ref_name author_committer_msg + file_change* + '0'; + + new_branch ::= 'brch' dst_ref_name src_ref_name; + dst_ref_name ::= ref_name; + src_ref_name ::= ref_name | sha1_exp; + + new_tag ::= 'tagg' ref_name tag_name tagger_msg; + + file_change ::= 'M' path_name hexsha1 + | 'D' path_name + ; + + author_committer_msg ::= len32 + 'author' sp name '<' email '>' ts tz lf + 'committer' sp name '<' email '>' ts tz lf + lf + binary_data; + + tagger_msg ::= len32 + 'tagger' sp name '<' email '>' ts tz lf + lf + binary_data; + + blob_data ::= len32 binary_data; # max len is 2^32-1 + path_name ::= len32 path; # max len is PATH_MAX-1 + ref_name ::= len32 ref; # max len is PATH_MAX-1 + tag_name ::= len32 tag; # max len is PATH_MAX-1 + sha1_exp ::= len32 sha1exp; # max len is PATH_MAX-1 + + len32 ::= # unsigned 32 bit value, native format; + binary_data ::= # file content, not interpreted; + sp ::= # ASCII space character; + lf ::= # ASCII newline (LF) character; + path ::= # GIT style file path, e.g. "a/b/c"; + ref ::= # GIT ref name, e.g. "refs/heads/MOZ_GECKO_EXPERIMENT"; + tag ::= # GIT tag name, e.g. 
"FIREFOX_1_5"; + sha1exp ::= # Any valid GIT SHA1 expression; + hexsha1 ::= # SHA1 in hexadecimal format; + name ::= # valid GIT author/committer name; + email ::= # valid GIT author/committer email; + ts ::= # time since the epoch in seconds, ascii decimal; + tz ::= # GIT style timezone; +*/ + #include "builtin.h" #include "cache.h" #include "object.h" #include "blob.h" +#include "tree.h" #include "delta.h" #include "pack.h" +#include "refs.h" #include "csum-file.h" struct object_entry @@ -13,9 +74,9 @@ struct object_entry unsigned char sha1[20]; }; -struct object_entry_block +struct object_entry_pool { - struct object_entry_block *next_block; + struct object_entry_pool *next_pool; struct object_entry *next_free; struct object_entry *end; struct object_entry entries[FLEX_ARRAY]; /* more */ @@ -29,31 +90,55 @@ struct last_object unsigned char sha1[20]; }; -struct tree; -struct tree_entry +struct mem_pool { - struct tree *tree; - mode_t mode; - unsigned char sha1[20]; - char name[FLEX_ARRAY]; /* more */ + struct mem_pool *next_pool; + char *next_free; + char *end; + char space[FLEX_ARRAY]; /* more */ }; -struct tree +struct atom_str { - struct last_object last_tree; - unsigned long entry_count; - struct tree_entry **entries; + struct atom_str *next_atom; + int str_len; + char str_dat[FLEX_ARRAY]; /* more */ +}; + +struct tree_content; +struct tree_entry +{ + struct tree_content *tree; + struct atom_str* name; + unsigned int mode; + unsigned char sha1[20]; +}; + +struct tree_content +{ + unsigned int entry_capacity; /* must match avail_tree_content */ + unsigned int entry_count; + struct tree_entry *entries[FLEX_ARRAY]; /* more */ +}; + +struct avail_tree_content +{ + unsigned int entry_capacity; /* must match tree_content */ + struct avail_tree_content *next_avail; }; struct branch { - struct branch *next_branch; - struct tree_entry tree; - unsigned char sha1[20]; + struct branch *table_next_branch; + struct branch *active_next_branch; const char *name; + unsigned 
long last_commit; + struct tree_entry branch_tree; + unsigned char sha1[20]; }; -/* Stats and misc. counters. */ + +/* Stats and misc. counters */ static int max_depth = 10; static unsigned long alloc_count; static unsigned long branch_count; @@ -62,29 +147,50 @@ static unsigned long duplicate_count; static unsigned long object_count_by_type[9]; static unsigned long duplicate_count_by_type[9]; -/* The .pack file */ +/* Memory pools */ +static size_t mem_pool_alloc = 2*1024*1024 - sizeof(struct mem_pool); +static size_t total_allocd; +static struct mem_pool *mem_pool; + +/* atom management */ +static unsigned int atom_table_sz = 4451; +static unsigned int atom_cnt; +static struct atom_str **atom_table; + +/* The .pack file being generated */ static int pack_fd; static unsigned long pack_offset; static unsigned char pack_sha1[20]; /* Table of objects we've written. */ -struct object_entry_block *blocks; -struct object_entry *object_table[1 << 16]; +static unsigned int object_entry_alloc = 1000; +static struct object_entry_pool *blocks; +static struct object_entry *object_table[1 << 16]; /* Our last blob */ -struct last_object last_blob; +static struct last_object last_blob; + +/* Tree management */ +static unsigned int tree_entry_alloc = 1000; +static void *avail_tree_entry; +static unsigned int avail_tree_table_sz = 100; +static struct avail_tree_content **avail_tree_table; /* Branch data */ -struct branch *branches; -struct branch *current_branch; +static unsigned int max_active_branches = 5; +static unsigned int cur_active_branches; +static unsigned int branch_table_sz = 1039; +static struct branch **branch_table; +static struct branch *active_branches; + static void alloc_objects(int cnt) { - struct object_entry_block *b; + struct object_entry_pool *b; - b = xmalloc(sizeof(struct object_entry_block) + b = xmalloc(sizeof(struct object_entry_pool) + cnt * sizeof(struct object_entry)); - b->next_block = blocks; + b->next_pool = blocks; b->next_free = b->entries; 
b->end = b->entries + cnt; blocks = b; @@ -96,18 +202,28 @@ static struct object_entry* new_object(unsigned char *sha1) struct object_entry *e; if (blocks->next_free == blocks->end) - alloc_objects(1000); + alloc_objects(object_entry_alloc); e = blocks->next_free++; memcpy(e->sha1, sha1, sizeof(e->sha1)); return e; } +static struct object_entry* find_object(unsigned char *sha1) +{ + unsigned int h = sha1[0] << 8 | sha1[1]; + struct object_entry *e; + for (e = object_table[h]; e; e = e->next) + if (!memcmp(sha1, e->sha1, sizeof(e->sha1))) + return e; + return NULL; +} + static struct object_entry* insert_object(unsigned char *sha1) { unsigned int h = sha1[0] << 8 | sha1[1]; struct object_entry *e = object_table[h]; - struct object_entry *p = 0; + struct object_entry *p = NULL; while (e) { if (!memcmp(sha1, e->sha1, sizeof(e->sha1))) @@ -117,7 +233,7 @@ static struct object_entry* insert_object(unsigned char *sha1) } e = new_object(sha1); - e->next = 0; + e->next = NULL; e->offset = 0; if (p) p->next = e; @@ -126,64 +242,240 @@ static struct object_entry* insert_object(unsigned char *sha1) return e; } -static ssize_t yread(int fd, void *buffer, size_t length) +static unsigned int hc_str(const char *s, size_t len) +{ + unsigned int r = 0; + while (len-- > 0) + r = r * 31 + *s++; + return r; +} + +static void* pool_alloc(size_t len) +{ + struct mem_pool *p; + void *r; + + for (p = mem_pool; p; p = p->next_pool) + if ((p->end - p->next_free >= len)) + break; + + if (!p) { + if (len >= (mem_pool_alloc/2)) { + total_allocd += len; + return xmalloc(len); + } + total_allocd += sizeof(struct mem_pool) + mem_pool_alloc; + p = xmalloc(sizeof(struct mem_pool) + mem_pool_alloc); + p->next_pool = mem_pool; + p->next_free = p->space; + p->end = p->next_free + mem_pool_alloc; + mem_pool = p; + } + + r = p->next_free; + p->next_free += len; + return r; +} + +static void* pool_calloc(size_t count, size_t size) +{ + size_t len = count * size; + void *r = pool_alloc(len); + memset(r, 
0, len); + return r; +} + +static char* pool_strdup(const char *s) +{ + char *r = pool_alloc(strlen(s) + 1); + strcpy(r, s); + return r; +} + +static struct atom_str* to_atom(const char *s, size_t len) +{ + unsigned int hc = hc_str(s, len) % atom_table_sz; + struct atom_str *c; + + for (c = atom_table[hc]; c; c = c->next_atom) + if (c->str_len == len && !strncmp(s, c->str_dat, len)) + return c; + + c = pool_alloc(sizeof(struct atom_str) + len + 1); + c->str_len = len; + strncpy(c->str_dat, s, len); + c->str_dat[len] = 0; + c->next_atom = atom_table[hc]; + atom_table[hc] = c; + atom_cnt++; + return c; +} + +static struct branch* lookup_branch(const char *name) +{ + unsigned int hc = hc_str(name, strlen(name)) % branch_table_sz; + struct branch *b; + + for (b = branch_table[hc]; b; b = b->table_next_branch) + if (!strcmp(name, b->name)) + return b; + return NULL; +} + +static struct branch* new_branch(const char *name) +{ + unsigned int hc = hc_str(name, strlen(name)) % branch_table_sz; + struct branch* b = lookup_branch(name); + + if (b) + die("Invalid attempt to create duplicate branch: %s", name); + + b = pool_calloc(1, sizeof(struct branch)); + b->name = pool_strdup(name); + b->table_next_branch = branch_table[hc]; + branch_table[hc] = b; + branch_count++; + return b; +} + +static unsigned int hc_entries(unsigned int cnt) +{ + cnt = cnt & 7 ? (cnt / 8) + 1 : cnt / 8; + return cnt < avail_tree_table_sz ? cnt : avail_tree_table_sz - 1; +} + +static struct tree_content* new_tree_content(unsigned int cnt) +{ + struct avail_tree_content *f, *l = NULL; + struct tree_content *t; + unsigned int hc = hc_entries(cnt); + + for (f = avail_tree_table[hc]; f; l = f, f = f->next_avail) + if (f->entry_capacity >= cnt) + break; + + if (f) { + if (l) + l->next_avail = f->next_avail; + else + avail_tree_table[hc] = f->next_avail; + } else { + cnt = cnt & 7 ? 
((cnt / 8) + 1) * 8 : cnt; + f = pool_alloc(sizeof(*t) + sizeof(t->entries[0]) * cnt); + f->entry_capacity = cnt; + } + + t = (struct tree_content*)f; + t->entry_count = 0; + return t; +} + +static void release_tree_entry(struct tree_entry *e); +static void release_tree_content(struct tree_content *t) +{ + struct avail_tree_content *f = (struct avail_tree_content*)t; + unsigned int hc = hc_entries(f->entry_capacity); + unsigned int i; + for (i = 0; i < t->entry_count; i++) + release_tree_entry(t->entries[i]); + f->next_avail = avail_tree_table[hc]; + avail_tree_table[hc] = f; +} + +static struct tree_content* grow_tree_content( + struct tree_content *t, + int amt) +{ + struct tree_content *r = new_tree_content(t->entry_count + amt); + r->entry_count = t->entry_count; + memcpy(r->entries,t->entries,t->entry_count*sizeof(t->entries[0])); + release_tree_content(t); + return r; +} + +static struct tree_entry* new_tree_entry() +{ + struct tree_entry *e; + + if (!avail_tree_entry) { + unsigned int n = tree_entry_alloc; + avail_tree_entry = e = xmalloc(n * sizeof(struct tree_entry)); + while (n--) { + *((void**)e) = e + 1; + e++; + } + } + + e = avail_tree_entry; + avail_tree_entry = *((void**)e); + return e; +} + +static void release_tree_entry(struct tree_entry *e) +{ + if (e->tree) + release_tree_content(e->tree); + *((void**)e) = avail_tree_entry; + avail_tree_entry = e; +} + +static void yread(int fd, void *buffer, size_t length) { ssize_t ret = 0; while (ret < length) { ssize_t size = xread(fd, (char *) buffer + ret, length - ret); - if (size < 0) { - return size; - } - if (size == 0) { - return ret; - } + if (!size) + die("Read from descriptor %i: end of stream", fd); + if (size < 0) + die("Read from descriptor %i: %s", fd, strerror(errno)); ret += size; } - return ret; } -static ssize_t ywrite(int fd, void *buffer, size_t length) +static int optional_read(int fd, void *buffer, size_t length) +{ + ssize_t ret = 0; + while (ret < length) { + ssize_t size = xread(fd, 
(char *) buffer + ret, length - ret); + if (!size && !ret) + return 1; + if (!size) + die("Read from descriptor %i: end of stream", fd); + if (size < 0) + die("Read from descriptor %i: %s", fd, strerror(errno)); + ret += size; + } + return 0; +} + +static void ywrite(int fd, void *buffer, size_t length) { ssize_t ret = 0; while (ret < length) { ssize_t size = xwrite(fd, (char *) buffer + ret, length - ret); - if (size < 0) { - return size; - } - if (size == 0) { - return ret; - } + if (!size) + die("Write to descriptor %i: end of file", fd); + if (size < 0) + die("Write to descriptor %i: %s", fd, strerror(errno)); ret += size; } - return ret; } -static const char* read_string() +static const char* read_path() { static char sn[PATH_MAX]; unsigned long slen; - if (yread(0, &slen, 4) != 4) - die("Can't obtain string"); + yread(0, &slen, 4); if (!slen) - return 0; + die("Expected string command parameter, didn't find one"); if (slen > (PATH_MAX - 1)) die("Can't handle excessive string length %lu", slen); - - if (yread(0, sn, slen) != slen) - die("Can't obtain string of length %lu", slen); + yread(0, sn, slen); sn[slen] = 0; return sn; } -static const char* read_required_string() -{ - const char *r = read_string(); - if (!r) - die("Expected string command parameter, didn't find one"); - return r; -} - static unsigned long encode_header( enum object_type type, unsigned long size, @@ -234,13 +526,13 @@ static int store_object( if (e->offset) { duplicate_count++; duplicate_count_by_type[type]++; - return 0; + return 1; } e->offset = pack_offset; object_count++; object_count_by_type[type]++; - if (last->data && last->depth < max_depth) + if (last && last->data && last->depth < max_depth) delta = diff_delta(last->data, last->len, dat, datlen, &deltalen, 0); @@ -255,18 +547,16 @@ static int store_object( s.next_in = delta; s.avail_in = deltalen; hdrlen = encode_header(OBJ_DELTA, deltalen, hdr); - if (ywrite(pack_fd, hdr, hdrlen) != hdrlen) - die("Can't write object header: 
%s", strerror(errno)); - if (ywrite(pack_fd, last->sha1, sizeof(sha1)) != sizeof(sha1)) - die("Can't write object base: %s", strerror(errno)); + ywrite(pack_fd, hdr, hdrlen); + ywrite(pack_fd, last->sha1, sizeof(sha1)); pack_offset += hdrlen + sizeof(sha1); } else { - last->depth = 0; + if (last) + last->depth = 0; s.next_in = dat; s.avail_in = datlen; hdrlen = encode_header(type, datlen, hdr); - if (ywrite(pack_fd, hdr, hdrlen) != hdrlen) - die("Can't write object header: %s", strerror(errno)); + ywrite(pack_fd, hdr, hdrlen); pack_offset += hdrlen; } @@ -276,18 +566,220 @@ static int store_object( /* nothing */; deflateEnd(&s); - if (ywrite(pack_fd, out, s.total_out) != s.total_out) - die("Failed writing compressed data %s", strerror(errno)); + ywrite(pack_fd, out, s.total_out); pack_offset += s.total_out; free(out); if (delta) free(delta); - if (last->data) - free(last->data); - last->data = dat; - last->len = datlen; - memcpy(last->sha1, sha1, sizeof(sha1)); + if (last) { + if (last->data) + free(last->data); + last->data = dat; + last->len = datlen; + memcpy(last->sha1, sha1, sizeof(sha1)); + } + return 0; +} + +static const char *get_mode(const char *str, unsigned int *modep) +{ + unsigned char c; + unsigned int mode = 0; + + while ((c = *str++) != ' ') { + if (c < '0' || c > '7') + return NULL; + mode = (mode << 3) + (c - '0'); + } + *modep = mode; + return str; +} + +static void load_tree(struct tree_entry *root) +{ + struct object_entry *myoe; + struct tree_content *t; + unsigned long size; + char *buf; + const char *c; + char type[20]; + + root->tree = t = new_tree_content(8); + if (!memcmp(root->sha1, null_sha1, 20)) + return; + + myoe = find_object(root->sha1); + if (myoe) { + die("FIXME"); + } else { + buf = read_sha1_file(root->sha1, type, &size); + if (!buf || strcmp(type, tree_type)) + die("Can't load existing tree %s", sha1_to_hex(root->sha1)); + } + + c = buf; + while (c != (buf + size)) { + struct tree_entry *e = new_tree_entry(); + + if 
(t->entry_count == t->entry_capacity) + root->tree = t = grow_tree_content(t, 8); + t->entries[t->entry_count++] = e; + + e->tree = NULL; + c = get_mode(c, &e->mode); + if (!c) + die("Corrupt mode in %s", sha1_to_hex(root->sha1)); + e->name = to_atom(c, strlen(c)); + c += e->name->str_len + 1; + memcpy(e->sha1, c, sizeof(e->sha1)); + c += 20; + } + free(buf); +} + +static int tecmp (const void *_a, const void *_b) +{ + struct tree_entry *a = *((struct tree_entry**)_a); + struct tree_entry *b = *((struct tree_entry**)_b); + return base_name_compare( + a->name->str_dat, a->name->str_len, a->mode, + b->name->str_dat, b->name->str_len, b->mode); +} + +static void store_tree(struct tree_entry *root) +{ + struct tree_content *t = root->tree; + unsigned int i; + size_t maxlen; + char *buf, *c; + + if (memcmp(root->sha1, null_sha1, 20)) + return; + + maxlen = 0; + for (i = 0; i < t->entry_count; i++) { + maxlen += t->entries[i]->name->str_len + 34; + if (t->entries[i]->tree) + store_tree(t->entries[i]); + } + + qsort(t->entries, t->entry_count, sizeof(t->entries[0]), tecmp); + buf = c = xmalloc(maxlen); + for (i = 0; i < t->entry_count; i++) { + struct tree_entry *e = t->entries[i]; + c += sprintf(c, "%o", e->mode); + *c++ = ' '; + strcpy(c, e->name->str_dat); + c += e->name->str_len + 1; + memcpy(c, e->sha1, 20); + c += 20; + } + store_object(OBJ_TREE, buf, c - buf, NULL, root->sha1); + free(buf); +} + +static int tree_content_set( + struct tree_entry *root, + const char *p, + const unsigned char *sha1, + const unsigned int mode) +{ + struct tree_content *t = root->tree; + const char *slash1; + unsigned int i, n; + struct tree_entry *e; + + slash1 = strchr(p, '/'); + if (slash1) + n = slash1 - p; + else + n = strlen(p); + + for (i = 0; i < t->entry_count; i++) { + e = t->entries[i]; + if (e->name->str_len == n && !strncmp(p, e->name->str_dat, n)) { + if (!slash1) { + if (e->mode == mode && !memcmp(e->sha1, sha1, 20)) + return 0; + e->mode = mode; + memcpy(e->sha1, sha1, 
20); + if (e->tree) { + release_tree_content(e->tree); + e->tree = NULL; + } + memcpy(root->sha1, null_sha1, 20); + return 1; + } + if (!S_ISDIR(e->mode)) { + e->tree = new_tree_content(8); + e->mode = 040000; + } + if (!e->tree) + load_tree(e); + if (tree_content_set(e, slash1 + 1, sha1, mode)) { + memcpy(root->sha1, null_sha1, 20); + return 1; + } + return 0; + } + } + + if (t->entry_count == t->entry_capacity) + root->tree = t = grow_tree_content(t, 8); + e = new_tree_entry(); + e->name = to_atom(p, n); + t->entries[t->entry_count++] = e; + if (slash1) { + e->tree = new_tree_content(8); + e->mode = 040000; + tree_content_set(e, slash1 + 1, sha1, mode); + } else { + e->tree = NULL; + e->mode = mode; + memcpy(e->sha1, sha1, 20); + } + memcpy(root->sha1, null_sha1, 20); + return 1; +} + +static int tree_content_remove(struct tree_entry *root, const char *p) +{ + struct tree_content *t = root->tree; + const char *slash1; + unsigned int i, n; + struct tree_entry *e; + + slash1 = strchr(p, '/'); + if (slash1) + n = slash1 - p; + else + n = strlen(p); + + for (i = 0; i < t->entry_count; i++) { + e = t->entries[i]; + if (e->name->str_len == n && !strncmp(p, e->name->str_dat, n)) { + if (!slash1 || !S_ISDIR(e->mode)) + goto del_entry; + if (!e->tree) + load_tree(e); + if (tree_content_remove(e, slash1 + 1)) { + if (!e->tree->entry_count) + goto del_entry; + memcpy(root->sha1, null_sha1, 20); + return 1; + } + return 0; + } + } + return 0; + +del_entry: + for (i++; i < t->entry_count; i++) + t->entries[i-1] = t->entries[i]; + t->entry_count--; + release_tree_entry(e); + memcpy(root->sha1, null_sha1, 20); return 1; } @@ -298,13 +790,9 @@ static void init_pack_header() unsigned long zero = 0; version = htonl(version); - - if (ywrite(pack_fd, (char*)magic, 4) != 4) - die("Can't write pack magic: %s", strerror(errno)); - if (ywrite(pack_fd, &version, 4) != 4) - die("Can't write pack version: %s", strerror(errno)); - if (ywrite(pack_fd, &zero, 4) != 4) - die("Can't write 0 
object count: %s", strerror(errno)); + ywrite(pack_fd, (char*)magic, 4); + ywrite(pack_fd, &version, 4); + ywrite(pack_fd, &zero, 4); pack_offset = 4 * 3; } @@ -320,14 +808,12 @@ static void fixup_header_footer() die("Failed seeking to start: %s", strerror(errno)); SHA1_Init(&c); - if (yread(pack_fd, hdr, 8) != 8) - die("Failed reading header: %s", strerror(errno)); + yread(pack_fd, hdr, 8); SHA1_Update(&c, hdr, 8); cnt = htonl(object_count); SHA1_Update(&c, &cnt, 4); - if (ywrite(pack_fd, &cnt, 4) != 4) - die("Failed writing object count: %s", strerror(errno)); + ywrite(pack_fd, &cnt, 4); buf = xmalloc(128 * 1024); for (;;) { @@ -339,8 +825,7 @@ static void fixup_header_footer() free(buf); SHA1_Final(pack_sha1, &c); - if (ywrite(pack_fd, pack_sha1, sizeof(pack_sha1)) != sizeof(pack_sha1)) - die("Failed writing pack checksum: %s", strerror(errno)); + ywrite(pack_fd, pack_sha1, sizeof(pack_sha1)); } static int oecmp (const void *_a, const void *_b) @@ -355,14 +840,14 @@ static void write_index(const char *idx_name) struct sha1file *f; struct object_entry **idx, **c, **last; struct object_entry *e; - struct object_entry_block *o; + struct object_entry_pool *o; unsigned int array[256]; int i; /* Build the sorted table of object IDs. 
*/ idx = xmalloc(object_count * sizeof(struct object_entry*)); c = idx; - for (o = blocks; o; o = o->next_block) + for (o = blocks; o; o = o->next_pool) for (e = o->entries; e != o->next_free; e++) *c++ = e; last = idx + object_count; @@ -393,118 +878,175 @@ static void write_index(const char *idx_name) free(idx); } -static void new_blob() +static void dump_branches() +{ + static const char *msg = "fast-import"; + unsigned int i; + struct branch *b; + struct ref_lock *lock; + + for (i = 0; i < branch_table_sz; i++) { + for (b = branch_table[i]; b; b = b->table_next_branch) { + lock = lock_any_ref_for_update(b->name, NULL, 0); + if (!lock || write_ref_sha1(lock, b->sha1, msg) < 0) + die("Can't write %s", b->name); + } + } +} + +static void cmd_new_blob() { unsigned long datlen; + unsigned char sha1[20]; void *dat; - if (yread(0, &datlen, 4) != 4) - die("Can't obtain blob length"); - + yread(0, &datlen, 4); dat = xmalloc(datlen); - if (yread(0, dat, datlen) != datlen) - die("Con't obtain %lu bytes of blob data", datlen); - - if (!store_object(OBJ_BLOB, dat, datlen, &last_blob, 0)) + yread(0, dat, datlen); + if (store_object(OBJ_BLOB, dat, datlen, &last_blob, sha1)) free(dat); } -static struct branch* lookup_branch(const char *name) +static void unload_one_branch() { - struct branch *b; - for (b = branches; b; b = b->next_branch) { - if (!strcmp(name, b->name)) - return b; + while (cur_active_branches >= max_active_branches) { + unsigned long min_commit = ULONG_MAX; + struct branch *e, *l = NULL, *p = NULL; + + for (e = active_branches; e; e = e->active_next_branch) { + if (e->last_commit < min_commit) { + p = l; + min_commit = e->last_commit; + } + l = e; + } + + if (p) { + e = p->active_next_branch; + p->active_next_branch = e->active_next_branch; + } else { + e = active_branches; + active_branches = e->active_next_branch; + } + e->active_next_branch = NULL; + if (e->branch_tree.tree) { + release_tree_content(e->branch_tree.tree); + e->branch_tree.tree = NULL; + } + 
cur_active_branches--; } - die("No branch named '%s' has been declared", name); } -static struct tree* deep_copy_tree (struct tree *t) +static void load_branch(struct branch *b) { - struct tree *r = xmalloc(sizeof(struct tree)); - unsigned long i; - - if (t->last_tree.data) { - r->last_tree.data = xmalloc(t->last_tree.len); - r->last_tree.len = t->last_tree.len; - r->last_tree.depth = t->last_tree.depth; - memcpy(r->last_tree.data, t->last_tree.data, t->last_tree.len); - memcpy(r->last_tree.sha1, t->last_tree.sha1, sizeof(t->last_tree.sha1)); - } - - r->entry_count = t->entry_count; - r->entries = xmalloc(t->entry_count * sizeof(struct tree_entry*)); - for (i = 0; i < t->entry_count; i++) { - struct tree_entry *a = t->entries[i]; - struct tree_entry *b; - - b = xmalloc(sizeof(struct tree_entry) + strlen(a->name) + 1); - b->tree = a->tree ? deep_copy_tree(a->tree) : 0; - b->mode = a->mode; - memcpy(b->sha1, a->sha1, sizeof(a->sha1)); - strcpy(b->name, a->name); - r->entries[i] = b; - } - - return r; + load_tree(&b->branch_tree); + b->active_next_branch = active_branches; + active_branches = b; + cur_active_branches++; } -static void store_tree (struct tree_entry *e) +static void file_change_m(struct branch *b) { - struct tree *t = e->tree; - unsigned long maxlen, i; - char *buf, *c; + const char *path = read_path(); + char hexsha1[41]; + unsigned char sha1[20]; - if (memcmp(null_sha1, e->sha1, sizeof(e->sha1))) - return; + yread(0, hexsha1, 40); + hexsha1[40] = 0; - maxlen = t->entry_count * 32; - for (i = 0; i < t->entry_count; i++) - maxlen += strlen(t->entries[i]->name); + if (get_sha1_hex(hexsha1, sha1)) + die("Invalid sha1 %s for %s", hexsha1, path); - buf = c = xmalloc(maxlen); - for (i = 0; i < t->entry_count; i++) { - struct tree_entry *e = t->entries[i]; - c += sprintf(c, "%o %s", e->mode, e->name) + 1; - if (e->tree) - store_tree(e); - memcpy(c, e->sha1, sizeof(e->sha1)); - c += sizeof(e->sha1); + tree_content_set(&b->branch_tree, path, sha1, 0100644); +} 
+ +static void file_change_d(struct branch *b) +{ + tree_content_remove(&b->branch_tree, read_path()); +} + +static void cmd_new_commit() +{ + static const unsigned int max_hdr_len = 94; + const char *name = read_path(); + struct branch *b = lookup_branch(name); + unsigned int acmsglen; + char *body, *c; + + if (!b) + die("Branch not declared: %s", name); + if (!b->branch_tree.tree) { + unload_one_branch(); + load_branch(b); } - if (!store_object(OBJ_TREE, buf, c - buf, &t->last_tree, e->sha1)) - free(buf); -} + /* author_committer_msg */ + yread(0, &acmsglen, 4); + body = xmalloc(acmsglen + max_hdr_len); + c = body + max_hdr_len; + yread(0, c, acmsglen); -static void new_branch() -{ - struct branch *nb = xcalloc(1, sizeof(struct branch)); - const char *source_name; - - nb->name = strdup(read_required_string()); - source_name = read_string(); - if (source_name) { - struct branch *sb = lookup_branch(source_name); - nb->tree.tree = deep_copy_tree(sb->tree.tree); - memcpy(nb->tree.sha1, sb->tree.sha1, sizeof(sb->tree.sha1)); - memcpy(nb->sha1, sb->sha1, sizeof(sb->sha1)); - } else { - nb->tree.tree = xcalloc(1, sizeof(struct tree)); - nb->tree.tree->entries = xmalloc(8*sizeof(struct tree_entry*)); + /* file_change* */ + for (;;) { + unsigned char cmd; + yread(0, &cmd, 1); + if (cmd == '0') + break; + else if (cmd == 'M') + file_change_m(b); + else if (cmd == 'D') + file_change_d(b); + else + die("Unsupported file_change: %c", cmd); } - nb->next_branch = branches; - branches = nb; - branch_count++; + + if (memcmp(b->sha1, null_sha1, 20)) { + sprintf(c - 48, "parent %s", sha1_to_hex(b->sha1)); + *(c - 1) = '\n'; + c -= 48; + } + store_tree(&b->branch_tree); + sprintf(c - 46, "tree %s", sha1_to_hex(b->branch_tree.sha1)); + *(c - 1) = '\n'; + c -= 46; + + store_object(OBJ_COMMIT, + c, (body + max_hdr_len + acmsglen) - c, + NULL, b->sha1); + free(body); + b->last_commit = object_count_by_type[OBJ_COMMIT]; } -static void set_branch() +static void cmd_new_branch() { - 
current_branch = lookup_branch(read_required_string()); -} + struct branch *b = new_branch(read_path()); + const char *base = read_path(); + struct branch *s = lookup_branch(base); -static void commit() -{ - store_tree(¤t_branch->tree); + if (!strcmp(b->name, base)) + die("Can't create a branch from itself: %s", base); + else if (s) { + memcpy(b->sha1, s->sha1, 20); + memcpy(b->branch_tree.sha1, s->branch_tree.sha1, 20); + } + else if (!get_sha1(base, b->sha1)) { + if (!memcmp(b->sha1, null_sha1, 20)) + memcpy(b->branch_tree.sha1, null_sha1, 20); + else { + unsigned long size; + char *buf; + + buf = read_object_with_reference(b->sha1, + type_names[OBJ_COMMIT], &size, b->sha1); + if (!buf || size < 46) + die("Not a valid commit: %s", base); + if (memcmp("tree ", buf, 5) + || get_sha1_hex(buf + 5, b->branch_tree.sha1)) + die("The commit %s is corrupt", sha1_to_hex(b->sha1)); + free(buf); + } + } else + die("Not a SHA1 or branch: %s", base); } int main(int argc, const char **argv) @@ -515,6 +1057,9 @@ int main(int argc, const char **argv) char *idx_name; struct stat sb; + setup_ident(); + git_config(git_default_config); + pack_name = xmalloc(strlen(base_name) + 6); sprintf(pack_name, "%s.pack", base_name); idx_name = xmalloc(strlen(base_name) + 5); @@ -525,17 +1070,21 @@ int main(int argc, const char **argv) die("Can't create %s: %s", pack_name, strerror(errno)); alloc_objects(est_obj_cnt); + + atom_table = xcalloc(atom_table_sz, sizeof(struct atom_str*)); + branch_table = xcalloc(branch_table_sz, sizeof(struct branch*)); + avail_tree_table = xcalloc(avail_tree_table_sz, sizeof(struct avail_tree_content*)); + init_pack_header(); for (;;) { unsigned long cmd; - if (yread(0, &cmd, 4) != 4) + if (optional_read(0, &cmd, 4)) break; - switch (cmd) { - case 'blob': new_blob(); break; - case 'newb': new_branch(); break; - case 'setb': set_branch(); break; - case 'comt': commit(); break; + switch (ntohl(cmd)) { + case 'blob': cmd_new_blob(); break; + case 'comt': 
cmd_new_commit(); break; + case 'brch': cmd_new_branch(); break; default: die("Invalid command %lu", cmd); } @@ -543,6 +1092,7 @@ int main(int argc, const char **argv) fixup_header_footer(); close(pack_fd); write_index(idx_name); + dump_branches(); fprintf(stderr, "%s statistics:\n", argv[0]); fprintf(stderr, "---------------------------------------------------\n"); @@ -553,6 +1103,8 @@ int main(int argc, const char **argv) fprintf(stderr, " commits: %10lu (%10lu duplicates)\n", object_count_by_type[OBJ_COMMIT], duplicate_count_by_type[OBJ_COMMIT]); fprintf(stderr, " tags : %10lu (%10lu duplicates)\n", object_count_by_type[OBJ_TAG], duplicate_count_by_type[OBJ_TAG]); fprintf(stderr, "Total branches: %10lu\n", branch_count); + fprintf(stderr, "Total atoms: %10u\n", atom_cnt); + fprintf(stderr, "Memory pools: %10lu MiB\n", total_allocd/(1024*1024)); fprintf(stderr, "---------------------------------------------------\n"); stat(pack_name, &sb); From 7111feede9c5905199ba48645fadc369faca5711 Mon Sep 17 00:00:00 2001 From: "Shawn O. Pearce" Date: Mon, 14 Aug 2006 02:50:18 -0400 Subject: [PATCH 08/81] Implement blob ID validation in fast-import. When accepting revision SHA1 IDs from the frontend verify the SHA1 actually refers to a blob and is known to exist. It's an error to use a SHA1 in a tree if the blob doesn't exist as this would cause git-fsck-objects to report a missing blob should the pack get closed without the blob being appended into it or a subsequent pack. So right now we'll just ask that the frontend "pre-declare" any blobs it wants to use in a tree before it can use them. Signed-off-by: Shawn O. 
Pearce --- fast-import.c | 28 ++++++++++++++++++++++++---- 1 file changed, 24 insertions(+), 4 deletions(-) diff --git a/fast-import.c b/fast-import.c index 4605b7469b..95b84f57e5 100644 --- a/fast-import.c +++ b/fast-import.c @@ -70,6 +70,7 @@ Format of STDIN stream: struct object_entry { struct object_entry *next; + enum object_type type; unsigned long offset; unsigned char sha1[20]; }; @@ -528,6 +529,7 @@ static int store_object( duplicate_count_by_type[type]++; return 1; } + e->type = type; e->offset = pack_offset; object_count++; object_count_by_type[type]++; @@ -713,7 +715,7 @@ static int tree_content_set( } if (!S_ISDIR(e->mode)) { e->tree = new_tree_content(8); - e->mode = 040000; + e->mode = S_IFDIR; } if (!e->tree) load_tree(e); @@ -732,7 +734,7 @@ static int tree_content_set( t->entries[t->entry_count++] = e; if (slash1) { e->tree = new_tree_content(8); - e->mode = 040000; + e->mode = S_IFDIR; tree_content_set(e, slash1 + 1, sha1, mode); } else { e->tree = NULL; @@ -948,16 +950,28 @@ static void load_branch(struct branch *b) static void file_change_m(struct branch *b) { const char *path = read_path(); + struct object_entry *oe; char hexsha1[41]; unsigned char sha1[20]; + char type[20]; yread(0, hexsha1, 40); hexsha1[40] = 0; if (get_sha1_hex(hexsha1, sha1)) die("Invalid sha1 %s for %s", hexsha1, path); + oe = find_object(sha1); + if (oe) { + if (oe->type != OBJ_BLOB) + die("%s is a %s not a blob (for %s)", hexsha1, type_names[oe->type], path); + } else { + if (sha1_object_info(sha1, type, NULL)) + die("No blob %s for %s", hexsha1, path); + if (strcmp(blob_type, type)) + die("%s is a %s not a blob (for %s)", hexsha1, type, path); + } - tree_content_set(&b->branch_tree, path, sha1, 0100644); + tree_content_set(&b->branch_tree, path, sha1, S_IFREG | 0644); } static void file_change_d(struct branch *b) @@ -986,6 +1000,10 @@ static void cmd_new_commit() c = body + max_hdr_len; yread(0, c, acmsglen); + /* oddly enough this is all that fsck-objects cares about 
*/ + if (memcmp(c, "author ", 7)) + die("Invalid commit format on branch %s", name); + /* file_change* */ for (;;) { unsigned char cmd; @@ -1104,7 +1122,9 @@ int main(int argc, const char **argv) fprintf(stderr, " tags : %10lu (%10lu duplicates)\n", object_count_by_type[OBJ_TAG], duplicate_count_by_type[OBJ_TAG]); fprintf(stderr, "Total branches: %10lu\n", branch_count); fprintf(stderr, "Total atoms: %10u\n", atom_cnt); - fprintf(stderr, "Memory pools: %10lu MiB\n", total_allocd/(1024*1024)); + fprintf(stderr, "Memory total: %10lu KiB\n", (total_allocd + alloc_count*sizeof(struct object_entry))/1024); + fprintf(stderr, " pools: %10lu KiB\n", total_allocd/1024); + fprintf(stderr, " objects: %10lu KiB\n", (alloc_count*sizeof(struct object_entry))/1024); fprintf(stderr, "---------------------------------------------------\n"); stat(pack_name, &sb); From c44cdc7eef212ec09901eb2e0996476e0468ed88 Mon Sep 17 00:00:00 2001 From: "Shawn O. Pearce" Date: Mon, 14 Aug 2006 20:16:28 -0400 Subject: [PATCH 09/81] Converted fast-import to a text based protocol. Frontend clients can now send a text stream to fast-import rather than a binary stream. This should facilitate developing frontend software as the data stream is easier to view, manipulate and debug by hand and Mark-I eyeball. Signed-off-by: Shawn O. Pearce --- fast-import.c | 470 ++++++++++++++++++++++++++++++++++---------------- 1 file changed, 318 insertions(+), 152 deletions(-) diff --git a/fast-import.c b/fast-import.c index 95b84f57e5..2953e80cde 100644 --- a/fast-import.c +++ b/fast-import.c @@ -4,57 +4,93 @@ Format of STDIN stream: stream ::= cmd*; cmd ::= new_blob - | new_commit | new_branch + | new_commit | new_tag ; - new_blob ::= 'blob' blob_data; + new_blob ::= 'blob' lf + mark? + file_content; + file_content ::= data; - new_commit ::= 'comt' ref_name author_committer_msg + new_branch ::= 'branch' sp ref_str lf + ('from' sp (ref_str | hexsha1 | sha1exp_str | idnum) lf)? 
+ lf; + + new_commit ::= 'commit' sp ref_str lf + mark? + ('author' sp name '<' email '>' ts tz lf)? + 'committer' sp name '<' email '>' ts tz lf + commit_msg file_change* - '0'; + lf; + commit_msg ::= data; - new_branch ::= 'brch' dst_ref_name src_ref_name; - dst_ref_name ::= ref_name; - src_ref_name ::= ref_name | sha1_exp; - - new_tag ::= 'tagg' ref_name tag_name tagger_msg; - - file_change ::= 'M' path_name hexsha1 - | 'D' path_name + file_change ::= 'M' sp mode sp (hexsha1 | idnum) sp path_str lf + | 'D' sp path_str lf ; + mode ::= '644' | '755'; - author_committer_msg ::= len32 - 'author' sp name '<' email '>' ts tz lf - 'committer' sp name '<' email '>' ts tz lf - lf - binary_data; + new_tag ::= 'tag' sp tag_str lf + 'from' sp (ref_str | hexsha1 | sha1exp_str | idnum) lf + 'tagger' sp name '<' email '>' ts tz lf + tag_msg; + tag_msg ::= data; - tagger_msg ::= len32 - 'tagger' sp name '<' email '>' ts tz lf - lf - binary_data; + # note: the first idnum in a stream should be 1 and subsequent + # idnums should not have gaps between values as this will cause + # the stream parser to reserve space for the gapped values. An + # idnum can be updated in the future to a new object by issuing + # a new mark directive with the old idnum. + # + mark ::= 'mark' sp idnum lf; - blob_data ::= len32 binary_data; # max len is 2^32-1 - path_name ::= len32 path; # max len is PATH_MAX-1 - ref_name ::= len32 ref; # max len is PATH_MAX-1 - tag_name ::= len32 tag; # max len is PATH_MAX-1 - sha1_exp ::= len32 sha1exp; # max len is PATH_MAX-1 + # note: declen indicates the length of binary_data in bytes. + # declen does not include the lf preceeding or trailing the + # binary data. + # + data ::= 'data' sp declen lf + binary_data + lf; - len32 ::= # unsigned 32 bit value, native format; + # note: quoted strings are C-style quoting supporting \c for + # common escapes of 'c' (e..g \n, \t, \\, \") or \nnn where nnn + # is the signed byte value in octal. 
Note that the only + # characters which must actually be escaped to protect the + # stream formatting is: \, " and LF. Otherwise these values + # are UTF8. + # + ref_str ::= ref | '"' quoted(ref) '"' ; + sha1exp_str ::= sha1exp | '"' quoted(sha1exp) '"' ; + tag_str ::= tag | '"' quoted(tag) '"' ; + path_str ::= path | '"' quoted(path) '"' ; + + declen ::= # unsigned 32 bit value, ascii base10 notation; binary_data ::= # file content, not interpreted; + sp ::= # ASCII space character; lf ::= # ASCII newline (LF) character; - path ::= # GIT style file path, e.g. "a/b/c"; - ref ::= # GIT ref name, e.g. "refs/heads/MOZ_GECKO_EXPERIMENT"; - tag ::= # GIT tag name, e.g. "FIREFOX_1_5"; + + # note: a colon (':') must precede the numerical value assigned to + # an idnum. This is to distinguish it from a ref or tag name as + # GIT does not permit ':' in ref or tag strings. + # + idnum ::= ':' declen; + path ::= # GIT style file path, e.g. "a/b/c"; + ref ::= # GIT ref name, e.g. "refs/heads/MOZ_GECKO_EXPERIMENT"; + tag ::= # GIT tag name, e.g. "FIREFOX_1_5"; sha1exp ::= # Any valid GIT SHA1 expression; hexsha1 ::= # SHA1 in hexadecimal format; - name ::= # valid GIT author/committer name; + + # note: name and email are UTF8 strings, however name must not + # contain '<' or lf and email must not contain any of the + # following: '<', '>', lf. 
+ # + name ::= # valid GIT author/committer name; email ::= # valid GIT author/committer email; - ts ::= # time since the epoch in seconds, ascii decimal; - tz ::= # GIT style timezone; + ts ::= # time since the epoch in seconds, ascii base10 notation; + tz ::= # GIT style timezone; */ #include "builtin.h" @@ -66,6 +102,8 @@ Format of STDIN stream: #include "pack.h" #include "refs.h" #include "csum-file.h" +#include "strbuf.h" +#include "quote.h" struct object_entry { @@ -153,7 +191,7 @@ static size_t mem_pool_alloc = 2*1024*1024 - sizeof(struct mem_pool); static size_t total_allocd; static struct mem_pool *mem_pool; -/* atom management */ +/* Atom management */ static unsigned int atom_table_sz = 4451; static unsigned int atom_cnt; static struct atom_str **atom_table; @@ -184,6 +222,10 @@ static unsigned int branch_table_sz = 1039; static struct branch **branch_table; static struct branch *active_branches; +/* Input stream parsing */ +static struct strbuf command_buf; +static unsigned long command_mark; + static void alloc_objects(int cnt) { @@ -330,6 +372,8 @@ static struct branch* new_branch(const char *name) if (b) die("Invalid attempt to create duplicate branch: %s", name); + if (check_ref_format(name)) + die("Branch name doesn't conform to GIT standards: %s", name); b = pool_calloc(1, sizeof(struct branch)); b->name = pool_strdup(name); @@ -433,22 +477,6 @@ static void yread(int fd, void *buffer, size_t length) } } -static int optional_read(int fd, void *buffer, size_t length) -{ - ssize_t ret = 0; - while (ret < length) { - ssize_t size = xread(fd, (char *) buffer + ret, length - ret); - if (!size && !ret) - return 1; - if (!size) - die("Read from descriptor %i: end of stream", fd); - if (size < 0) - die("Read from descriptor %i: %s", fd, strerror(errno)); - ret += size; - } - return 0; -} - static void ywrite(int fd, void *buffer, size_t length) { ssize_t ret = 0; @@ -462,24 +490,9 @@ static void ywrite(int fd, void *buffer, size_t length) } } -static const 
char* read_path() -{ - static char sn[PATH_MAX]; - unsigned long slen; - - yread(0, &slen, 4); - if (!slen) - die("Expected string command parameter, didn't find one"); - if (slen > (PATH_MAX - 1)) - die("Can't handle excessive string length %lu", slen); - yread(0, sn, slen); - sn[slen] = 0; - return sn; -} - -static unsigned long encode_header( +static size_t encode_header( enum object_type type, - unsigned long size, + size_t size, unsigned char *hdr) { int n = 1; @@ -503,7 +516,7 @@ static unsigned long encode_header( static int store_object( enum object_type type, void *dat, - unsigned long datlen, + size_t datlen, struct last_object *last, unsigned char *sha1out) { @@ -896,15 +909,57 @@ static void dump_branches() } } +static void read_next_command() +{ + read_line(&command_buf, stdin, '\n'); +} + +static void cmd_mark() +{ + if (!strncmp("mark :", command_buf.buf, 6)) { + command_mark = strtoul(command_buf.buf + 6, NULL, 10); + read_next_command(); + } + else + command_mark = 0; +} + +static void* cmd_data (size_t *size) +{ + size_t n = 0; + void *buffer; + size_t length; + + if (strncmp("data ", command_buf.buf, 5)) + die("Expected 'data n' command, found: %s", command_buf.buf); + + length = strtoul(command_buf.buf + 5, NULL, 10); + buffer = xmalloc(length); + + while (n < length) { + size_t s = fread((char*)buffer + n, 1, length - n, stdin); + if (!s && feof(stdin)) + die("EOF in data (%lu bytes remaining)", length - n); + n += s; + } + + if (fgetc(stdin) != '\n') + die("An lf did not trail the binary data as expected."); + + *size = length; + return buffer; +} + static void cmd_new_blob() { - unsigned long datlen; - unsigned char sha1[20]; + size_t datlen; void *dat; + unsigned char sha1[20]; + + read_next_command(); + cmd_mark(); + dat = cmd_data(&datlen); - yread(0, &datlen, 4); - dat = xmalloc(datlen); - yread(0, dat, datlen); if (store_object(OBJ_BLOB, dat, datlen, &last_blob, sha1)) free(dat); } @@ -949,122 +1004,231 @@ static void load_branch(struct 
branch *b) static void file_change_m(struct branch *b) { - const char *path = read_path(); + const char *p = command_buf.buf + 2; + char *p_uq; + const char *endp; struct object_entry *oe; - char hexsha1[41]; unsigned char sha1[20]; + unsigned int mode; char type[20]; - yread(0, hexsha1, 40); - hexsha1[40] = 0; + p = get_mode(p, &mode); + if (!p) + die("Corrupt mode: %s", command_buf.buf); + switch (mode) { + case S_IFREG | 0644: + case S_IFREG | 0755: + case 0644: + case 0755: + /* ok */ + break; + default: + die("Corrupt mode: %s", command_buf.buf); + } + + if (get_sha1_hex(p, sha1)) + die("Invalid SHA1: %s", command_buf.buf); + p += 40; + if (*p++ != ' ') + die("Missing space after SHA1: %s", command_buf.buf); + + p_uq = unquote_c_style(p, &endp); + if (p_uq) { + if (*endp) + die("Garbage after path in: %s", command_buf.buf); + p = p_uq; + } - if (get_sha1_hex(hexsha1, sha1)) - die("Invalid sha1 %s for %s", hexsha1, path); oe = find_object(sha1); if (oe) { if (oe->type != OBJ_BLOB) - die("%s is a %s not a blob (for %s)", hexsha1, type_names[oe->type], path); + die("Not a blob (actually a %s): %s", + command_buf.buf, type_names[oe->type]); } else { if (sha1_object_info(sha1, type, NULL)) - die("No blob %s for %s", hexsha1, path); + die("Blob not found: %s", command_buf.buf); if (strcmp(blob_type, type)) - die("%s is a %s not a blob (for %s)", hexsha1, type, path); + die("Not a blob (actually a %s): %s", + command_buf.buf, type); } - tree_content_set(&b->branch_tree, path, sha1, S_IFREG | 0644); + tree_content_set(&b->branch_tree, p, sha1, S_IFREG | mode); + + if (p_uq) + free(p_uq); } static void file_change_d(struct branch *b) { - tree_content_remove(&b->branch_tree, read_path()); + const char *p = command_buf.buf + 2; + char *p_uq; + const char *endp; + + p_uq = unquote_c_style(p, &endp); + if (p_uq) { + if (*endp) + die("Garbage after path in: %s", command_buf.buf); + p = p_uq; + } + tree_content_remove(&b->branch_tree, p); + if (p_uq) + free(p_uq); } static 
void cmd_new_commit() { - static const unsigned int max_hdr_len = 94; - const char *name = read_path(); - struct branch *b = lookup_branch(name); - unsigned int acmsglen; - char *body, *c; + struct branch *b; + void *msg; + size_t msglen; + char *str_uq; + const char *endp; + char *sp; + char *author = NULL; + char *committer = NULL; + char *body; + /* Obtain the branch name from the rest of our command */ + sp = strchr(command_buf.buf, ' ') + 1; + str_uq = unquote_c_style(sp, &endp); + if (str_uq) { + if (*endp) + die("Garbage after ref in: %s", command_buf.buf); + sp = str_uq; + } + b = lookup_branch(sp); if (!b) - die("Branch not declared: %s", name); + die("Branch not declared: %s", sp); + if (str_uq) + free(str_uq); + + read_next_command(); + cmd_mark(); + if (!strncmp("author ", command_buf.buf, 7)) { + author = strdup(command_buf.buf); + read_next_command(); + } + if (!strncmp("committer ", command_buf.buf, 10)) { + committer = strdup(command_buf.buf); + read_next_command(); + } + if (!committer) + die("Expected committer but didn't get one"); + msg = cmd_data(&msglen); + + /* ensure the branch is active/loaded */ if (!b->branch_tree.tree) { unload_one_branch(); load_branch(b); } - /* author_committer_msg */ - yread(0, &acmsglen, 4); - body = xmalloc(acmsglen + max_hdr_len); - c = body + max_hdr_len; - yread(0, c, acmsglen); - - /* oddly enough this is all that fsck-objects cares about */ - if (memcmp(c, "author ", 7)) - die("Invalid commit format on branch %s", name); - /* file_change* */ for (;;) { - unsigned char cmd; - yread(0, &cmd, 1); - if (cmd == '0') + read_next_command(); + if (1 == command_buf.len) break; - else if (cmd == 'M') + else if (!strncmp("M ", command_buf.buf, 2)) file_change_m(b); - else if (cmd == 'D') + else if (!strncmp("D ", command_buf.buf, 2)) file_change_d(b); else - die("Unsupported file_change: %c", cmd); + die("Unsupported file_change: %s", command_buf.buf); } - if (memcmp(b->sha1, null_sha1, 20)) { - sprintf(c - 48, "parent 
%s", sha1_to_hex(b->sha1)); - *(c - 1) = '\n'; - c -= 48; - } + /* build the tree and the commit */ store_tree(&b->branch_tree); - sprintf(c - 46, "tree %s", sha1_to_hex(b->branch_tree.sha1)); - *(c - 1) = '\n'; - c -= 46; + body = xmalloc(97 + msglen + + (author + ? strlen(author) + strlen(committer) + : 2 * strlen(committer))); + sp = body; + sp += sprintf(sp, "tree %s\n", sha1_to_hex(b->branch_tree.sha1)); + if (memcmp(b->sha1, null_sha1, 20)) + sp += sprintf(sp, "parent %s\n", sha1_to_hex(b->sha1)); + if (author) + sp += sprintf(sp, "%s\n", author); + else + sp += sprintf(sp, "author %s\n", committer + 10); + sp += sprintf(sp, "%s\n\n", committer); + memcpy(sp, msg, msglen); + sp += msglen; + if (author) + free(author); + free(committer); + free(msg); - store_object(OBJ_COMMIT, - c, (body + max_hdr_len + acmsglen) - c, - NULL, b->sha1); + store_object(OBJ_COMMIT, body, sp - body, NULL, b->sha1); free(body); b->last_commit = object_count_by_type[OBJ_COMMIT]; } static void cmd_new_branch() { - struct branch *b = new_branch(read_path()); - const char *base = read_path(); - struct branch *s = lookup_branch(base); + struct branch *b; + char *str_uq; + const char *endp; + char *sp; - if (!strcmp(b->name, base)) - die("Can't create a branch from itself: %s", base); - else if (s) { - memcpy(b->sha1, s->sha1, 20); - memcpy(b->branch_tree.sha1, s->branch_tree.sha1, 20); + /* Obtain the new branch name from the rest of our command */ + sp = strchr(command_buf.buf, ' ') + 1; + str_uq = unquote_c_style(sp, &endp); + if (str_uq) { + if (*endp) + die("Garbage after ref in: %s", command_buf.buf); + sp = str_uq; } - else if (!get_sha1(base, b->sha1)) { - if (!memcmp(b->sha1, null_sha1, 20)) - memcpy(b->branch_tree.sha1, null_sha1, 20); - else { - unsigned long size; - char *buf; + b = new_branch(sp); + if (str_uq) + free(str_uq); + read_next_command(); - buf = read_object_with_reference(b->sha1, - type_names[OBJ_COMMIT], &size, b->sha1); - if (!buf || size < 46) - die("Not a 
valid commit: %s", base); - if (memcmp("tree ", buf, 5) - || get_sha1_hex(buf + 5, b->branch_tree.sha1)) - die("The commit %s is corrupt", sha1_to_hex(b->sha1)); - free(buf); + /* from ... */ + if (!strncmp("from ", command_buf.buf, 5)) { + const char *from; + struct branch *s; + + from = strchr(command_buf.buf, ' ') + 1; + str_uq = unquote_c_style(from, &endp); + if (str_uq) { + if (*endp) + die("Garbage after string in: %s", command_buf.buf); + from = str_uq; } - } else - die("Not a SHA1 or branch: %s", base); + + s = lookup_branch(from); + if (b == s) + die("Can't create a branch from itself: %s", b->name); + else if (s) { + memcpy(b->sha1, s->sha1, 20); + memcpy(b->branch_tree.sha1, s->branch_tree.sha1, 20); + } else if (!get_sha1(from, b->sha1)) { + if (!memcmp(b->sha1, null_sha1, 20)) + memcpy(b->branch_tree.sha1, null_sha1, 20); + else { + unsigned long size; + char *buf; + + buf = read_object_with_reference(b->sha1, + type_names[OBJ_COMMIT], &size, b->sha1); + if (!buf || size < 46) + die("Not a valid commit: %s", from); + if (memcmp("tree ", buf, 5) + || get_sha1_hex(buf + 5, b->branch_tree.sha1)) + die("The commit %s is corrupt", sha1_to_hex(b->sha1)); + free(buf); + } + } else + die("Invalid ref name or SHA1 expression: %s", from); + + if (str_uq) + free(str_uq); + read_next_command(); + } else { + memcpy(b->sha1, null_sha1, 20); + memcpy(b->branch_tree.sha1, null_sha1, 20); + } + + if (command_buf.eof || command_buf.len > 1) + die("An lf did not terminate the branch command as expected."); } int main(int argc, const char **argv) @@ -1087,26 +1251,28 @@ int main(int argc, const char **argv) if (pack_fd < 0) die("Can't create %s: %s", pack_name, strerror(errno)); + init_pack_header(); alloc_objects(est_obj_cnt); + strbuf_init(&command_buf); atom_table = xcalloc(atom_table_sz, sizeof(struct atom_str*)); branch_table = xcalloc(branch_table_sz, sizeof(struct branch*)); avail_tree_table = xcalloc(avail_tree_table_sz, sizeof(struct avail_tree_content*)); - 
init_pack_header(); for (;;) { - unsigned long cmd; - if (optional_read(0, &cmd, 4)) + read_next_command(); + if (command_buf.eof) break; - - switch (ntohl(cmd)) { - case 'blob': cmd_new_blob(); break; - case 'comt': cmd_new_commit(); break; - case 'brch': cmd_new_branch(); break; - default: - die("Invalid command %lu", cmd); - } + else if (!strcmp("blob", command_buf.buf)) + cmd_new_blob(); + else if (!strncmp("branch ", command_buf.buf, 7)) + cmd_new_branch(); + else if (!strncmp("commit ", command_buf.buf, 7)) + cmd_new_commit(); + else + die("Unsupported command: %s", command_buf.buf); } + fixup_header_footer(); close(pack_fd); write_index(idx_name); From c90be46abdbd102ab8e9af0303d33976d552ae58 Mon Sep 17 00:00:00 2001 From: "Shawn O. Pearce" Date: Wed, 16 Aug 2006 01:57:57 -0400 Subject: [PATCH 10/81] Changed fast-import's pack header creation to use pack.h Signed-off-by: Shawn O. Pearce Signed-off-by: Shawn O. Pearce --- fast-import.c | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/fast-import.c b/fast-import.c index 2953e80cde..d5651693ba 100644 --- a/fast-import.c +++ b/fast-import.c @@ -800,15 +800,14 @@ del_entry: static void init_pack_header() { - const char* magic = "PACK"; - unsigned long version = 3; - unsigned long zero = 0; + struct pack_header hdr; - version = htonl(version); - ywrite(pack_fd, (char*)magic, 4); - ywrite(pack_fd, &version, 4); - ywrite(pack_fd, &zero, 4); - pack_offset = 4 * 3; + hdr.hdr_signature = htonl(PACK_SIGNATURE); + hdr.hdr_version = htonl(2); + hdr.hdr_entries = 0; + + ywrite(pack_fd, &hdr, sizeof(hdr)); + pack_offset = sizeof(hdr); } static void fixup_header_footer() From ace4a9d1ae5efd056c5e57cc76aacee3057a73f7 Mon Sep 17 00:00:00 2001 From: "Shawn O. Pearce" Date: Mon, 21 Aug 2006 03:29:13 -0400 Subject: [PATCH 11/81] Allow symlink blobs in trees during fast-import. If a frontend is smart enough to import a symlink then we should let them do so. 
We'll assume that they were smart enough to first generate a blob to hold the link target, as that's how symlinks get represented in GIT. Signed-off-by: Shawn O. Pearce --- fast-import.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fast-import.c b/fast-import.c index d5651693ba..7d1ee1dad9 100644 --- a/fast-import.c +++ b/fast-import.c @@ -1017,6 +1017,7 @@ static void file_change_m(struct branch *b) switch (mode) { case S_IFREG | 0644: case S_IFREG | 0755: + case S_IFLNK: case 0644: case 0755: /* ok */ From afde8dd96dbb81688d7cb22330e4fffcfc7def21 Mon Sep 17 00:00:00 2001 From: "Shawn O. Pearce" Date: Wed, 23 Aug 2006 01:33:47 -0400 Subject: [PATCH 12/81] Fixed segfault in fast-import after growing a tree. Growing a tree caused all subtrees to be deallocated and put back into the free list yet those subtree's contents were still actively in use. Consequently they were doled out again and got stomped on elsewhere. Releasing a tree is now performed in two parts, either releasing only the content array or releasing the content array and recursively releasing the subtree(s). Signed-off-by: Shawn O. 
Pearce --- fast-import.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/fast-import.c b/fast-import.c index 7d1ee1dad9..4c2431f0b0 100644 --- a/fast-import.c +++ b/fast-import.c @@ -420,11 +420,16 @@ static void release_tree_content(struct tree_content *t) { struct avail_tree_content *f = (struct avail_tree_content*)t; unsigned int hc = hc_entries(f->entry_capacity); + f->next_avail = avail_tree_table[hc]; + avail_tree_table[hc] = f; +} + +static void release_tree_content_recursive(struct tree_content *t) +{ unsigned int i; for (i = 0; i < t->entry_count; i++) release_tree_entry(t->entries[i]); - f->next_avail = avail_tree_table[hc]; - avail_tree_table[hc] = f; + release_tree_content(t); } static struct tree_content* grow_tree_content( @@ -459,7 +464,7 @@ static struct tree_entry* new_tree_entry() static void release_tree_entry(struct tree_entry *e) { if (e->tree) - release_tree_content(e->tree); + release_tree_content_recursive(e->tree); *((void**)e) = avail_tree_entry; avail_tree_entry = e; } @@ -720,7 +725,7 @@ static int tree_content_set( e->mode = mode; memcpy(e->sha1, sha1, 20); if (e->tree) { - release_tree_content(e->tree); + release_tree_content_recursive(e->tree); e->tree = NULL; } memcpy(root->sha1, null_sha1, 20); @@ -986,7 +991,7 @@ static void unload_one_branch() } e->active_next_branch = NULL; if (e->branch_tree.tree) { - release_tree_content(e->branch_tree.tree); + release_tree_content_recursive(e->branch_tree.tree); e->branch_tree.tree = NULL; } cur_active_branches--; From d5c57b284e847a56cc1d98b783be95ba94285afe Mon Sep 17 00:00:00 2001 From: "Shawn O. Pearce" Date: Wed, 23 Aug 2006 02:00:31 -0400 Subject: [PATCH 13/81] Converted fast-import to accept standard command line parameters. 
The following command line options are now accepted before the pack name: --objects=n # replaces the object count after the pack name --depth=n # delta chain depth to use (default is 10) --active-branches=n # maximum number of branches to keep in memory Signed-off-by: Shawn O. Pearce --- fast-import.c | 34 ++++++++++++++++++++++++++++------ 1 file changed, 28 insertions(+), 6 deletions(-) diff --git a/fast-import.c b/fast-import.c index 4c2431f0b0..8598493651 100644 --- a/fast-import.c +++ b/fast-import.c @@ -178,7 +178,7 @@ struct branch /* Stats and misc. counters */ -static int max_depth = 10; +static unsigned long max_depth = 10; static unsigned long alloc_count; static unsigned long branch_count; static unsigned long object_count; @@ -216,9 +216,9 @@ static unsigned int avail_tree_table_sz = 100; static struct avail_tree_content **avail_tree_table; /* Branch data */ -static unsigned int max_active_branches = 5; -static unsigned int cur_active_branches; -static unsigned int branch_table_sz = 1039; +static unsigned long max_active_branches = 5; +static unsigned long cur_active_branches; +static unsigned long branch_table_sz = 1039; static struct branch **branch_table; static struct branch *active_branches; @@ -1236,10 +1236,14 @@ static void cmd_new_branch() die("An lf did not terminate the branch command as expected."); } +static const char fast_import_usage[] = +"git-fast-import [--objects=n] [--depth=n] [--active-branches=n] temp.pack"; + int main(int argc, const char **argv) { - const char *base_name = argv[1]; - int est_obj_cnt = atoi(argv[2]); + const char *base_name; + int i; + unsigned long est_obj_cnt = 1000; char *pack_name; char *idx_name; struct stat sb; @@ -1247,6 +1251,24 @@ int main(int argc, const char **argv) setup_ident(); git_config(git_default_config); + for (i = 1; i < argc; i++) { + const char *a = argv[i]; + + if (*a != '-' || !strcmp(a, "--")) + break; + else if (!strncmp(a, "--objects=", 10)) + est_obj_cnt = strtoul(a + 10, NULL, 0); + 
else if (!strncmp(a, "--depth=", 8)) + max_depth = strtoul(a + 8, NULL, 0); + else if (!strncmp(a, "--active-branches=", 18)) + max_active_branches = strtoul(a + 18, NULL, 0); + else + die("unknown option %s", a); + } + if ((i+1) != argc) + usage(fast_import_usage); + base_name = argv[i]; + pack_name = xmalloc(strlen(base_name) + 6); sprintf(pack_name, "%s.pack", base_name); idx_name = xmalloc(strlen(base_name) + 5); From d83971688ba42e4cd37908f4d776801a997ca421 Mon Sep 17 00:00:00 2001 From: "Shawn O. Pearce" Date: Wed, 23 Aug 2006 04:17:45 -0400 Subject: [PATCH 14/81] Added mark store/find to fast-import. Marks are now saved when the mark directive gets used by the frontend and may be used in place of a SHA1 expression to locate a previous SHA1 which fast-import may have generated. This is particularly useful with commits where the frontend does not (easily) have the ability to compute the SHA1 for an arbitrary commit but needs it to generate a branch or tag from that commit. Signed-off-by: Shawn O. 
Pearce --- fast-import.c | 104 +++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 87 insertions(+), 17 deletions(-) diff --git a/fast-import.c b/fast-import.c index 8598493651..6e2f106a1a 100644 --- a/fast-import.c +++ b/fast-import.c @@ -121,6 +121,15 @@ struct object_entry_pool struct object_entry entries[FLEX_ARRAY]; /* more */ }; +struct mark_set +{ + int shift; + union { + struct object_entry *marked[1024]; + struct mark_set *sets[1024]; + } data; +}; + struct last_object { void *data; @@ -183,6 +192,7 @@ static unsigned long alloc_count; static unsigned long branch_count; static unsigned long object_count; static unsigned long duplicate_count; +static unsigned long marks_set_count; static unsigned long object_count_by_type[9]; static unsigned long duplicate_count_by_type[9]; @@ -205,6 +215,7 @@ static unsigned char pack_sha1[20]; static unsigned int object_entry_alloc = 1000; static struct object_entry_pool *blocks; static struct object_entry *object_table[1 << 16]; +static struct mark_set *marks; /* Our last blob */ static struct last_object last_blob; @@ -224,7 +235,7 @@ static struct branch *active_branches; /* Input stream parsing */ static struct strbuf command_buf; -static unsigned long command_mark; +static unsigned long next_mark; static void alloc_objects(int cnt) @@ -335,6 +346,48 @@ static char* pool_strdup(const char *s) return r; } +static void insert_mark(unsigned long idnum, struct object_entry *oe) +{ + struct mark_set *s = marks; + while ((idnum >> s->shift) >= 1024) { + s = pool_calloc(1, sizeof(struct mark_set)); + s->shift = marks->shift + 10; + s->data.sets[0] = marks; + marks = s; + } + while (s->shift) { + unsigned long i = idnum >> s->shift; + idnum -= i << s->shift; + if (!s->data.sets[i]) { + s->data.sets[i] = pool_calloc(1, sizeof(struct mark_set)); + s->data.sets[i]->shift = s->shift - 10; + } + s = s->data.sets[i]; + } + if (!s->data.marked[idnum]) + marks_set_count++; + s->data.marked[idnum] = oe; +} + +static 
struct object_entry* find_mark(unsigned long idnum) +{ + unsigned long orig_idnum = idnum; + struct mark_set *s = marks; + struct object_entry *oe = NULL; + if ((idnum >> s->shift) < 1024) { + while (s && s->shift) { + unsigned long i = idnum >> s->shift; + idnum -= i << s->shift; + s = s->data.sets[i]; + } + if (s) + oe = s->data.marked[idnum]; + } + if (!oe) + die("mark :%lu not declared", orig_idnum); + return oe; +} + static struct atom_str* to_atom(const char *s, size_t len) { unsigned int hc = hc_str(s, len) % atom_table_sz; @@ -523,7 +576,8 @@ static int store_object( void *dat, size_t datlen, struct last_object *last, - unsigned char *sha1out) + unsigned char *sha1out, + unsigned long mark) { void *out, *delta; struct object_entry *e; @@ -542,6 +596,8 @@ static int store_object( memcpy(sha1out, sha1, sizeof(sha1)); e = insert_object(sha1); + if (mark) + insert_mark(mark, e); if (e->offset) { duplicate_count++; duplicate_count_by_type[type]++; @@ -695,7 +751,7 @@ static void store_tree(struct tree_entry *root) memcpy(c, e->sha1, 20); c += 20; } - store_object(OBJ_TREE, buf, c - buf, NULL, root->sha1); + store_object(OBJ_TREE, buf, c - buf, NULL, root->sha1, 0); free(buf); } @@ -921,11 +977,11 @@ static void read_next_command() static void cmd_mark() { if (!strncmp("mark :", command_buf.buf, 6)) { - command_mark = strtoul(command_buf.buf + 6, NULL, 10); + next_mark = strtoul(command_buf.buf + 6, NULL, 10); read_next_command(); } else - command_mark = 0; + next_mark = 0; } static void* cmd_data (size_t *size) @@ -956,16 +1012,15 @@ static void* cmd_data (size_t *size) static void cmd_new_blob() { - size_t datlen; - void *dat; - unsigned char sha1[20]; + size_t l; + void *d; read_next_command(); cmd_mark(); - dat = cmd_data(&datlen); + d = cmd_data(&l); - if (store_object(OBJ_BLOB, dat, datlen, &last_blob, sha1)) - free(dat); + if (store_object(OBJ_BLOB, d, l, &last_blob, NULL, next_mark)) + free(d); } static void unload_one_branch() @@ -1031,9 +1086,16 @@ 
static void file_change_m(struct branch *b) die("Corrupt mode: %s", command_buf.buf); } - if (get_sha1_hex(p, sha1)) - die("Invalid SHA1: %s", command_buf.buf); - p += 40; + if (*p == ':') { + char *x; + oe = find_mark(strtoul(p + 1, &x, 10)); + p = x; + } else { + if (get_sha1_hex(p, sha1)) + die("Invalid SHA1: %s", command_buf.buf); + oe = find_object(sha1); + p += 40; + } if (*p++ != ' ') die("Missing space after SHA1: %s", command_buf.buf); @@ -1044,7 +1106,6 @@ static void file_change_m(struct branch *b) p = p_uq; } - oe = find_object(sha1); if (oe) { if (oe->type != OBJ_BLOB) die("Not a blob (actually a %s): %s", @@ -1161,7 +1222,7 @@ static void cmd_new_commit() free(committer); free(msg); - store_object(OBJ_COMMIT, body, sp - body, NULL, b->sha1); + store_object(OBJ_COMMIT, body, sp - body, NULL, b->sha1, next_mark); free(body); b->last_commit = object_count_by_type[OBJ_COMMIT]; } @@ -1205,6 +1266,13 @@ static void cmd_new_branch() else if (s) { memcpy(b->sha1, s->sha1, 20); memcpy(b->branch_tree.sha1, s->branch_tree.sha1, 20); + } else if (*from == ':') { + unsigned long idnum = strtoul(from + 1, NULL, 10); + struct object_entry *oe = find_mark(idnum); + if (oe->type != OBJ_COMMIT) + die("Mark :%lu not a commit", idnum); + memcpy(b->sha1, oe->sha1, 20); + memcpy(b->branch_tree.sha1, null_sha1, 20); } else if (!get_sha1(from, b->sha1)) { if (!memcmp(b->sha1, null_sha1, 20)) memcpy(b->branch_tree.sha1, null_sha1, 20); @@ -1285,6 +1353,7 @@ int main(int argc, const char **argv) atom_table = xcalloc(atom_table_sz, sizeof(struct atom_str*)); branch_table = xcalloc(branch_table_sz, sizeof(struct branch*)); avail_tree_table = xcalloc(avail_tree_table_sz, sizeof(struct avail_tree_content*)); + marks = pool_calloc(1, sizeof(struct mark_set)); for (;;) { read_next_command(); @@ -1314,7 +1383,8 @@ int main(int argc, const char **argv) fprintf(stderr, " commits: %10lu (%10lu duplicates)\n", object_count_by_type[OBJ_COMMIT], duplicate_count_by_type[OBJ_COMMIT]); 
fprintf(stderr, " tags : %10lu (%10lu duplicates)\n", object_count_by_type[OBJ_TAG], duplicate_count_by_type[OBJ_TAG]); fprintf(stderr, "Total branches: %10lu\n", branch_count); - fprintf(stderr, "Total atoms: %10u\n", atom_cnt); + fprintf(stderr, " atoms: %10u\n", atom_cnt); + fprintf(stderr, " marks: %10u (%10lu unique )\n", (1 << marks->shift) * 1024, marks_set_count); fprintf(stderr, "Memory total: %10lu KiB\n", (total_allocd + alloc_count*sizeof(struct object_entry))/1024); fprintf(stderr, " pools: %10lu KiB\n", total_allocd/1024); fprintf(stderr, " objects: %10lu KiB\n", (alloc_count*sizeof(struct object_entry))/1024); From d6c7eb2c160fc40c48fd25fdae15c193eec13bb7 Mon Sep 17 00:00:00 2001 From: "Shawn O. Pearce" Date: Wed, 23 Aug 2006 04:31:12 -0400 Subject: [PATCH 15/81] Added branch load counter to fast-import. If the branch load count exceeds the number of branches created then the frontend is causing fast-import to page branches into and out of memory due to the way it's ordering its commits. Performance can likely be increased if the frontend were to alter its commit sequence such that it stays on one branch before switching to another branch, then never returns to the prior branch. Signed-off-by: Shawn O. 
Pearce --- fast-import.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/fast-import.c b/fast-import.c index 6e2f106a1a..50171d69ca 100644 --- a/fast-import.c +++ b/fast-import.c @@ -190,6 +190,7 @@ struct branch static unsigned long max_depth = 10; static unsigned long alloc_count; static unsigned long branch_count; +static unsigned long branch_load_count; static unsigned long object_count; static unsigned long duplicate_count; static unsigned long marks_set_count; @@ -1059,6 +1060,7 @@ static void load_branch(struct branch *b) b->active_next_branch = active_branches; active_branches = b; cur_active_branches++; + branch_load_count++; } static void file_change_m(struct branch *b) @@ -1382,9 +1384,9 @@ int main(int argc, const char **argv) fprintf(stderr, " trees : %10lu (%10lu duplicates)\n", object_count_by_type[OBJ_TREE], duplicate_count_by_type[OBJ_TREE]); fprintf(stderr, " commits: %10lu (%10lu duplicates)\n", object_count_by_type[OBJ_COMMIT], duplicate_count_by_type[OBJ_COMMIT]); fprintf(stderr, " tags : %10lu (%10lu duplicates)\n", object_count_by_type[OBJ_TAG], duplicate_count_by_type[OBJ_TAG]); - fprintf(stderr, "Total branches: %10lu\n", branch_count); - fprintf(stderr, " atoms: %10u\n", atom_cnt); + fprintf(stderr, "Total branches: %10lu (%10lu loads )\n", branch_count, branch_load_count); fprintf(stderr, " marks: %10u (%10lu unique )\n", (1 << marks->shift) * 1024, marks_set_count); + fprintf(stderr, " atoms: %10u\n", atom_cnt); fprintf(stderr, "Memory total: %10lu KiB\n", (total_allocd + alloc_count*sizeof(struct object_entry))/1024); fprintf(stderr, " pools: %10lu KiB\n", total_allocd/1024); fprintf(stderr, " objects: %10lu KiB\n", (alloc_count*sizeof(struct object_entry))/1024); From 72303d44e9d8f3fc9bef039b472a2bd259509420 Mon Sep 17 00:00:00 2001 From: "Shawn O. Pearce" Date: Thu, 24 Aug 2006 03:12:13 -0400 Subject: [PATCH 16/81] Implemented 'tag' command in fast-import. 
Tags received from the frontend are generated in memory in a simple linked list in the order that the tag commands were sent by the frontend. If multiple different tag objects for the same tag name get generated the last one sent by the frontend will be the one that gets written out at termination. Multiple tag objects for the same name will cause all older tags of the same name to be lost. Signed-off-by: Shawn O. Pearce --- fast-import.c | 125 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 125 insertions(+) diff --git a/fast-import.c b/fast-import.c index 50171d69ca..e692f6b430 100644 --- a/fast-import.c +++ b/fast-import.c @@ -185,6 +185,13 @@ struct branch unsigned char sha1[20]; }; +struct tag +{ + struct tag *next_tag; + const char *name; + unsigned char sha1[20]; +}; + /* Stats and misc. counters */ static unsigned long max_depth = 10; @@ -234,6 +241,10 @@ static unsigned long branch_table_sz = 1039; static struct branch **branch_table; static struct branch *active_branches; +/* Tag data */ +static struct tag *first_tag; +static struct tag *last_tag; + /* Input stream parsing */ static struct strbuf command_buf; static unsigned long next_mark; @@ -970,6 +981,21 @@ static void dump_branches() } } +static void dump_tags() +{ + static const char *msg = "fast-import"; + struct tag *t; + struct ref_lock *lock; + char path[PATH_MAX]; + + for (t = first_tag; t; t = t->next_tag) { + sprintf(path, "refs/tags/%s", t->name); + lock = lock_any_ref_for_update(path, NULL, 0); + if (!lock || write_ref_sha1(lock, t->sha1, msg) < 0) + die("Can't write %s", path); + } +} + static void read_next_command() { read_line(&command_buf, stdin, '\n'); @@ -1306,6 +1332,102 @@ static void cmd_new_branch() die("An lf did not terminate the branch command as expected."); } +static void cmd_new_tag() +{ + char *str_uq; + const char *endp; + char *sp; + const char *from; + char *tagger; + struct branch *s; + void *msg; + size_t msglen; + char *body; + struct tag *t; + 
unsigned char sha1[20]; + + /* Obtain the new tag name from the rest of our command */ + sp = strchr(command_buf.buf, ' ') + 1; + str_uq = unquote_c_style(sp, &endp); + if (str_uq) { + if (*endp) + die("Garbage after tag name in: %s", command_buf.buf); + sp = str_uq; + } + t = pool_alloc(sizeof(struct tag)); + t->next_tag = NULL; + t->name = pool_strdup(sp); + if (last_tag) + last_tag->next_tag = t; + else + first_tag = t; + last_tag = t; + if (str_uq) + free(str_uq); + read_next_command(); + + /* from ... */ + if (strncmp("from ", command_buf.buf, 5)) + die("Expected from command, got %s", command_buf.buf); + + from = strchr(command_buf.buf, ' ') + 1; + str_uq = unquote_c_style(from, &endp); + if (str_uq) { + if (*endp) + die("Garbage after string in: %s", command_buf.buf); + from = str_uq; + } + + s = lookup_branch(from); + if (s) { + memcpy(sha1, s->sha1, 20); + } else if (*from == ':') { + unsigned long idnum = strtoul(from + 1, NULL, 10); + struct object_entry *oe = find_mark(idnum); + if (oe->type != OBJ_COMMIT) + die("Mark :%lu not a commit", idnum); + memcpy(sha1, oe->sha1, 20); + } else if (!get_sha1(from, sha1)) { + unsigned long size; + char *buf; + + buf = read_object_with_reference(sha1, + type_names[OBJ_COMMIT], &size, sha1); + if (!buf || size < 46) + die("Not a valid commit: %s", from); + free(buf); + } else + die("Invalid ref name or SHA1 expression: %s", from); + + if (str_uq) + free(str_uq); + read_next_command(); + + /* tagger ... 
*/ + if (strncmp("tagger ", command_buf.buf, 7)) + die("Expected tagger command, got %s", command_buf.buf); + tagger = strdup(command_buf.buf); + + /* tag payload/message */ + read_next_command(); + msg = cmd_data(&msglen); + + /* build the tag object */ + body = xmalloc(67 + strlen(t->name) + strlen(tagger) + msglen); + sp = body; + sp += sprintf(sp, "object %s\n", sha1_to_hex(sha1)); + sp += sprintf(sp, "type %s\n", type_names[OBJ_COMMIT]); + sp += sprintf(sp, "tag %s\n", t->name); + sp += sprintf(sp, "%s\n\n", tagger); + memcpy(sp, msg, msglen); + sp += msglen; + free(tagger); + free(msg); + + store_object(OBJ_TAG, body, sp - body, NULL, t->sha1, 0); + free(body); +} + static const char fast_import_usage[] = "git-fast-import [--objects=n] [--depth=n] [--active-branches=n] temp.pack"; @@ -1367,6 +1489,8 @@ int main(int argc, const char **argv) cmd_new_branch(); else if (!strncmp("commit ", command_buf.buf, 7)) cmd_new_commit(); + else if (!strncmp("tag ", command_buf.buf, 4)) + cmd_new_tag(); else die("Unsupported command: %s", command_buf.buf); } @@ -1375,6 +1499,7 @@ int main(int argc, const char **argv) close(pack_fd); write_index(idx_name); dump_branches(); + dump_tags(); fprintf(stderr, "%s statistics:\n", argv[0]); fprintf(stderr, "---------------------------------------------------\n"); From 41e5257fcf4db31dfa2576aac1f50b140f2bb058 Mon Sep 17 00:00:00 2001 From: "Shawn O. Pearce" Date: Thu, 24 Aug 2006 04:37:35 -0400 Subject: [PATCH 17/81] Implemented tree reloading in fast-import. Tree reloading allows fast-import to swap out the least-recently used branch by simply deallocating the data structures from memory that were associated with that branch. Later if the branch becomes active again it can lazily recreate those structures on demand by reloading the necessary trees from the pack file it originally wrote them to. 
The reloading process is implemented by mmap'ing the pack into memory and using a much tighter variant of the pack reading code contained in sha1_file.c. This was a blatant copy from sha1_file.c but the unpacking functions were significantly simplified and are actually now in a form that should make it easier to map only the necessary regions of a pack rather than the entire file. Signed-off-by: Shawn O. Pearce --- fast-import.c | 162 ++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 149 insertions(+), 13 deletions(-) diff --git a/fast-import.c b/fast-import.c index e692f6b430..1c74b90c84 100644 --- a/fast-import.c +++ b/fast-import.c @@ -198,6 +198,7 @@ static unsigned long max_depth = 10; static unsigned long alloc_count; static unsigned long branch_count; static unsigned long branch_load_count; +static unsigned long remap_count; static unsigned long object_count; static unsigned long duplicate_count; static unsigned long marks_set_count; @@ -216,8 +217,10 @@ static struct atom_str **atom_table; /* The .pack file being generated */ static int pack_fd; -static unsigned long pack_offset; +static unsigned long pack_size; static unsigned char pack_sha1[20]; +static void* pack_base; +static size_t pack_mlen; /* Table of objects we've written. 
*/ static unsigned int object_entry_alloc = 1000; @@ -616,7 +619,7 @@ static int store_object( return 1; } e->type = type; - e->offset = pack_offset; + e->offset = pack_size; object_count++; object_count_by_type[type]++; @@ -637,7 +640,7 @@ static int store_object( hdrlen = encode_header(OBJ_DELTA, deltalen, hdr); ywrite(pack_fd, hdr, hdrlen); ywrite(pack_fd, last->sha1, sizeof(sha1)); - pack_offset += hdrlen + sizeof(sha1); + pack_size += hdrlen + sizeof(sha1); } else { if (last) last->depth = 0; @@ -645,7 +648,7 @@ static int store_object( s.avail_in = datlen; hdrlen = encode_header(type, datlen, hdr); ywrite(pack_fd, hdr, hdrlen); - pack_offset += hdrlen; + pack_size += hdrlen; } s.avail_out = deflateBound(&s, s.avail_in); @@ -655,7 +658,7 @@ static int store_object( deflateEnd(&s); ywrite(pack_fd, out, s.total_out); - pack_offset += s.total_out; + pack_size += s.total_out; free(out); if (delta) @@ -670,6 +673,127 @@ static int store_object( return 0; } +static void* map_pack(unsigned long offset) +{ + if (offset >= pack_size) + die("object offset outside of pack file"); + if (offset >= pack_mlen) { + if (pack_base) + munmap(pack_base, pack_mlen); + /* round out how much we map to 16 MB units */ + pack_mlen = pack_size; + if (pack_mlen & ((1 << 24) - 1)) + pack_mlen = ((pack_mlen >> 24) + 1) << 24; + pack_base = mmap(NULL,pack_mlen,PROT_READ,MAP_SHARED,pack_fd,0); + if (pack_base == MAP_FAILED) + die("Failed to map generated pack: %s", strerror(errno)); + remap_count++; + } + return (char*)pack_base + offset; +} + +static unsigned long unpack_object_header(unsigned long offset, + enum object_type *type, + unsigned long *sizep) +{ + unsigned shift; + unsigned char c; + unsigned long size; + + c = *(unsigned char*)map_pack(offset++); + *type = (c >> 4) & 7; + size = c & 15; + shift = 4; + while (c & 0x80) { + c = *(unsigned char*)map_pack(offset++); + size += (c & 0x7f) << shift; + shift += 7; + } + *sizep = size; + return offset; +} + +static void 
*unpack_non_delta_entry(unsigned long o, unsigned long sz) +{ + z_stream stream; + unsigned char *result; + + result = xmalloc(sz + 1); + result[sz] = 0; + + memset(&stream, 0, sizeof(stream)); + stream.next_in = map_pack(o); + stream.avail_in = pack_mlen - o; + stream.next_out = result; + stream.avail_out = sz; + + inflateInit(&stream); + for (;;) { + int st = inflate(&stream, Z_FINISH); + if (st == Z_STREAM_END) + break; + if (st == Z_OK) { + o = stream.next_in - (unsigned char*)pack_base; + stream.next_in = map_pack(o); + stream.avail_in = pack_mlen - o; + continue; + } + die("Error from zlib during inflate."); + } + inflateEnd(&stream); + if (stream.total_out != sz) + die("Error after inflate: sizes mismatch"); + return result; +} + +static void *unpack_entry(unsigned long offset, unsigned long *sizep); + +static void *unpack_delta_entry(unsigned long offset, + unsigned long delta_size, + unsigned long *sizep) +{ + struct object_entry *base_oe; + unsigned char *base_sha1; + void *delta_data, *base, *result; + unsigned long base_size, result_size; + + base_sha1 = (unsigned char*)map_pack(offset + 20) - 20; + base_oe = find_object(base_sha1); + if (!base_oe) + die("I'm broken; I can't find a base I know must be here."); + base = unpack_entry(base_oe->offset, &base_size); + delta_data = unpack_non_delta_entry(offset + 20, delta_size); + result = patch_delta(base, base_size, + delta_data, delta_size, + &result_size); + if (!result) + die("failed to apply delta"); + free(delta_data); + free(base); + *sizep = result_size; + return result; +} + +static void *unpack_entry(unsigned long offset, unsigned long *sizep) +{ + unsigned long size; + enum object_type kind; + + offset = unpack_object_header(offset, &kind, &size); + switch (kind) { + case OBJ_DELTA: + return unpack_delta_entry(offset, size, sizep); + case OBJ_COMMIT: + case OBJ_TREE: + case OBJ_BLOB: + case OBJ_TAG: + *sizep = size; + return unpack_non_delta_entry(offset, size); + default: + die("I created an 
object I can't read!"); + } +} + static const char *get_mode(const char *str, unsigned int *modep) { unsigned char c; @@ -691,7 +815,6 @@ static void load_tree(struct tree_entry *root) unsigned long size; char *buf; const char *c; - char type[20]; root->tree = t = new_tree_content(8); if (!memcmp(root->sha1, null_sha1, 20)) @@ -699,11 +822,14 @@ static void load_tree(struct tree_entry *root) myoe = find_object(root->sha1); if (myoe) { - die("FIXME"); + if (myoe->type != OBJ_TREE) + die("Not a tree: %s", sha1_to_hex(root->sha1)); + buf = unpack_entry(myoe->offset, &size); } else { + char type[20]; buf = read_sha1_file(root->sha1, type, &size); - if (!buf || strcmp(type, tree_type)) - die("Can't load existing tree %s", sha1_to_hex(root->sha1)); + if (!buf || !strcmp(type, tree_type)) + die("Can't load tree %s", sha1_to_hex(root->sha1)); } c = buf; @@ -880,7 +1006,7 @@ static void init_pack_header() hdr.hdr_entries = 0; ywrite(pack_fd, &hdr, sizeof(hdr)); - pack_offset = sizeof(hdr); + pack_size = sizeof(hdr); } static void fixup_header_footer() @@ -1052,7 +1178,8 @@ static void cmd_new_blob() static void unload_one_branch() { - while (cur_active_branches >= max_active_branches) { + while (cur_active_branches + && cur_active_branches >= max_active_branches) { unsigned long min_commit = ULONG_MAX; struct branch *e, *l = NULL, *p = NULL; @@ -1210,7 +1337,7 @@ static void cmd_new_commit() msg = cmd_data(&msglen); /* ensure the branch is active/loaded */ - if (!b->branch_tree.tree) { + if (!b->branch_tree.tree || !max_active_branches) { unload_one_branch(); load_branch(b); } @@ -1297,10 +1424,18 @@ static void cmd_new_branch() } else if (*from == ':') { unsigned long idnum = strtoul(from + 1, NULL, 10); struct object_entry *oe = find_mark(idnum); + unsigned long size; + char *buf; if (oe->type != OBJ_COMMIT) die("Mark :%lu not a commit", idnum); memcpy(b->sha1, oe->sha1, 20); - memcpy(b->branch_tree.sha1, null_sha1, 20); + buf = unpack_entry(oe->offset, &size); + if (!buf 
|| size < 46) + die("Not a valid commit: %s", from); + if (memcmp("tree ", buf, 5) + || get_sha1_hex(buf + 5, b->branch_tree.sha1)) + die("The commit %s is corrupt", sha1_to_hex(b->sha1)); + free(buf); } else if (!get_sha1(from, b->sha1)) { if (!memcmp(b->sha1, null_sha1, 20)) memcpy(b->branch_tree.sha1, null_sha1, 20); @@ -1515,6 +1650,7 @@ int main(int argc, const char **argv) fprintf(stderr, "Memory total: %10lu KiB\n", (total_allocd + alloc_count*sizeof(struct object_entry))/1024); fprintf(stderr, " pools: %10lu KiB\n", total_allocd/1024); fprintf(stderr, " objects: %10lu KiB\n", (alloc_count*sizeof(struct object_entry))/1024); + fprintf(stderr, "Pack remaps: %10lu\n", remap_count); fprintf(stderr, "---------------------------------------------------\n"); stat(pack_name, &sb); From 8d8928b0511313ba1740d39c3920f8f12f36a10a Mon Sep 17 00:00:00 2001 From: "Shawn O. Pearce" Date: Thu, 24 Aug 2006 04:46:29 -0400 Subject: [PATCH 18/81] Round out memory pool allocations in fast-import to pointer sizes. Some architectures (e.g. SPARC) would require that we access pointers only on pointer-sized alignments. So ensure the pool allocator rounds out non-pointer sized allocations to the next pointer so we don't generate bad memory addresses. This could have occurred if we had previously allocated an atom whose string was not a whole multiple of the pointer size, for example. Signed-off-by: Shawn O. Pearce --- fast-import.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fast-import.c b/fast-import.c index 1c74b90c84..e42bdbd3a3 100644 --- a/fast-import.c +++ b/fast-import.c @@ -342,6 +342,9 @@ static void* pool_alloc(size_t len) } r = p->next_free; + /* round out to a pointer alignment */ + if (len & (sizeof(void*) - 1)) + len += sizeof(void*) - (len & (sizeof(void*) - 1)); p->next_free += len; return r; } From 00e2b8842c58e451fcf8038287c8420423bab50a Mon Sep 17 00:00:00 2001 From: "Shawn O. 
Pearce" Date: Thu, 24 Aug 2006 18:45:26 -0400 Subject: [PATCH 19/81] Remove branch creation command from fast-import. Jon Smirl was finding it difficult to alter cvs2svn to generate branch commands prior to the first commit of the same branch. This change moves the 'from' command to be an optional parameter of the 'commit' command, thereby allowing a new branch to be defined at the moment it gets used to create the first commit on that branch. This change makes it impossible to create a branch with no commits on it as at least one commit is needed to register the branch. Signed-off-by: Shawn O. Pearce --- fast-import.c | 169 +++++++++++++++++++++----------------------------- 1 file changed, 71 insertions(+), 98 deletions(-) diff --git a/fast-import.c b/fast-import.c index e42bdbd3a3..3e527edf70 100644 --- a/fast-import.c +++ b/fast-import.c @@ -4,7 +4,6 @@ Format of STDIN stream: stream ::= cmd*; cmd ::= new_blob - | new_branch | new_commit | new_tag ; @@ -14,15 +13,12 @@ Format of STDIN stream: file_content; file_content ::= data; - new_branch ::= 'branch' sp ref_str lf - ('from' sp (ref_str | hexsha1 | sha1exp_str | idnum) lf)? - lf; - new_commit ::= 'commit' sp ref_str lf - mark? - ('author' sp name '<' email '>' ts tz lf)? - 'committer' sp name '<' email '>' ts tz lf - commit_msg + ('from' sp (ref_str | hexsha1 | sha1exp_str | idnum) lf)? + mark? + ('author' sp name '<' email '>' ts tz lf)? 
+ 'committer' sp name '<' email '>' ts tz lf + commit_msg file_change* lf; commit_msg ::= data; @@ -831,7 +827,7 @@ static void load_tree(struct tree_entry *root) } else { char type[20]; buf = read_sha1_file(root->sha1, type, &size); - if (!buf || !strcmp(type, tree_type)) + if (!buf || strcmp(type, tree_type)) die("Can't load tree %s", sha1_to_hex(root->sha1)); } @@ -1299,6 +1295,69 @@ static void file_change_d(struct branch *b) free(p_uq); } +static void cmd_from(struct branch *b) +{ + const char *from, *endp; + char *str_uq; + struct branch *s; + + if (strncmp("from ", command_buf.buf, 5)) + return; + + if (b->last_commit) + die("Can't reinitailize branch %s", b->name); + + from = strchr(command_buf.buf, ' ') + 1; + str_uq = unquote_c_style(from, &endp); + if (str_uq) { + if (*endp) + die("Garbage after string in: %s", command_buf.buf); + from = str_uq; + } + + s = lookup_branch(from); + if (b == s) + die("Can't create a branch from itself: %s", b->name); + else if (s) { + memcpy(b->sha1, s->sha1, 20); + memcpy(b->branch_tree.sha1, s->branch_tree.sha1, 20); + } else if (*from == ':') { + unsigned long idnum = strtoul(from + 1, NULL, 10); + struct object_entry *oe = find_mark(idnum); + unsigned long size; + char *buf; + if (oe->type != OBJ_COMMIT) + die("Mark :%lu not a commit", idnum); + memcpy(b->sha1, oe->sha1, 20); + buf = unpack_entry(oe->offset, &size); + if (!buf || size < 46) + die("Not a valid commit: %s", from); + if (memcmp("tree ", buf, 5) + || get_sha1_hex(buf + 5, b->branch_tree.sha1)) + die("The commit %s is corrupt", sha1_to_hex(b->sha1)); + free(buf); + } else if (!get_sha1(from, b->sha1)) { + if (!memcmp(b->sha1, null_sha1, 20)) + memcpy(b->branch_tree.sha1, null_sha1, 20); + else { + unsigned long size; + char *buf; + + buf = read_object_with_reference(b->sha1, + type_names[OBJ_COMMIT], &size, b->sha1); + if (!buf || size < 46) + die("Not a valid commit: %s", from); + if (memcmp("tree ", buf, 5) + || get_sha1_hex(buf + 5, b->branch_tree.sha1)) 
+ die("The commit %s is corrupt", sha1_to_hex(b->sha1)); + free(buf); + } + } else + die("Invalid ref name or SHA1 expression: %s", from); + + read_next_command(); +} + static void cmd_new_commit() { struct branch *b; @@ -1321,11 +1380,12 @@ static void cmd_new_commit() } b = lookup_branch(sp); if (!b) - die("Branch not declared: %s", sp); + b = new_branch(sp); if (str_uq) free(str_uq); read_next_command(); + cmd_from(b); cmd_mark(); if (!strncmp("author ", command_buf.buf, 7)) { author = strdup(command_buf.buf); @@ -1385,91 +1445,6 @@ static void cmd_new_commit() b->last_commit = object_count_by_type[OBJ_COMMIT]; } -static void cmd_new_branch() -{ - struct branch *b; - char *str_uq; - const char *endp; - char *sp; - - /* Obtain the new branch name from the rest of our command */ - sp = strchr(command_buf.buf, ' ') + 1; - str_uq = unquote_c_style(sp, &endp); - if (str_uq) { - if (*endp) - die("Garbage after ref in: %s", command_buf.buf); - sp = str_uq; - } - b = new_branch(sp); - if (str_uq) - free(str_uq); - read_next_command(); - - /* from ... 
*/ - if (!strncmp("from ", command_buf.buf, 5)) { - const char *from; - struct branch *s; - - from = strchr(command_buf.buf, ' ') + 1; - str_uq = unquote_c_style(from, &endp); - if (str_uq) { - if (*endp) - die("Garbage after string in: %s", command_buf.buf); - from = str_uq; - } - - s = lookup_branch(from); - if (b == s) - die("Can't create a branch from itself: %s", b->name); - else if (s) { - memcpy(b->sha1, s->sha1, 20); - memcpy(b->branch_tree.sha1, s->branch_tree.sha1, 20); - } else if (*from == ':') { - unsigned long idnum = strtoul(from + 1, NULL, 10); - struct object_entry *oe = find_mark(idnum); - unsigned long size; - char *buf; - if (oe->type != OBJ_COMMIT) - die("Mark :%lu not a commit", idnum); - memcpy(b->sha1, oe->sha1, 20); - buf = unpack_entry(oe->offset, &size); - if (!buf || size < 46) - die("Not a valid commit: %s", from); - if (memcmp("tree ", buf, 5) - || get_sha1_hex(buf + 5, b->branch_tree.sha1)) - die("The commit %s is corrupt", sha1_to_hex(b->sha1)); - free(buf); - } else if (!get_sha1(from, b->sha1)) { - if (!memcmp(b->sha1, null_sha1, 20)) - memcpy(b->branch_tree.sha1, null_sha1, 20); - else { - unsigned long size; - char *buf; - - buf = read_object_with_reference(b->sha1, - type_names[OBJ_COMMIT], &size, b->sha1); - if (!buf || size < 46) - die("Not a valid commit: %s", from); - if (memcmp("tree ", buf, 5) - || get_sha1_hex(buf + 5, b->branch_tree.sha1)) - die("The commit %s is corrupt", sha1_to_hex(b->sha1)); - free(buf); - } - } else - die("Invalid ref name or SHA1 expression: %s", from); - - if (str_uq) - free(str_uq); - read_next_command(); - } else { - memcpy(b->sha1, null_sha1, 20); - memcpy(b->branch_tree.sha1, null_sha1, 20); - } - - if (command_buf.eof || command_buf.len > 1) - die("An lf did not terminate the branch command as expected."); -} - static void cmd_new_tag() { char *str_uq; @@ -1623,8 +1598,6 @@ int main(int argc, const char **argv) break; else if (!strcmp("blob", command_buf.buf)) cmd_new_blob(); - else if 
(!strncmp("branch ", command_buf.buf, 7)) - cmd_new_branch(); else if (!strncmp("commit ", command_buf.buf, 7)) cmd_new_commit(); else if (!strncmp("tag ", command_buf.buf, 4)) From 02f3389d9647378ed864ff1cdfb6f0238b64ee91 Mon Sep 17 00:00:00 2001 From: "Shawn O. Pearce" Date: Thu, 24 Aug 2006 22:38:13 -0400 Subject: [PATCH 20/81] Moved from command to after data to help cvs2svn. cvs2svn has three phases: begin_commit, middle_commit, end_commit. The ancestor is computed in the middle_commit phase. So it's easier to generate a stream if the from command appears after the commit message itself but before the file change commands. Signed-off-by: Shawn O. Pearce --- fast-import.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/fast-import.c b/fast-import.c index 3e527edf70..1842d0738b 100644 --- a/fast-import.c +++ b/fast-import.c @@ -14,11 +14,11 @@ Format of STDIN stream: file_content ::= data; new_commit ::= 'commit' sp ref_str lf - ('from' sp (ref_str | hexsha1 | sha1exp_str | idnum) lf)? mark? ('author' sp name '<' email '>' ts tz lf)? 'committer' sp name '<' email '>' ts tz lf commit_msg + ('from' sp (ref_str | hexsha1 | sha1exp_str | idnum) lf)? 
file_change* lf; commit_msg ::= data; @@ -1385,7 +1385,6 @@ static void cmd_new_commit() free(str_uq); read_next_command(); - cmd_from(b); cmd_mark(); if (!strncmp("author ", command_buf.buf, 7)) { author = strdup(command_buf.buf); @@ -1398,6 +1397,8 @@ static void cmd_new_commit() if (!committer) die("Expected committer but didn't get one"); msg = cmd_data(&msglen); + read_next_command(); + cmd_from(b); /* ensure the branch is active/loaded */ if (!b->branch_tree.tree || !max_active_branches) { @@ -1407,7 +1408,6 @@ static void cmd_new_commit() /* file_change* */ for (;;) { - read_next_command(); if (1 == command_buf.len) break; else if (!strncmp("M ", command_buf.buf, 2)) @@ -1416,6 +1416,7 @@ static void cmd_new_commit() file_change_d(b); else die("Unsupported file_change: %s", command_buf.buf); + read_next_command(); } /* build the tree and the commit */ From 8435a9cb2662ca4326e96ea78d58b9376fb21f7e Mon Sep 17 00:00:00 2001 From: "Shawn O. Pearce" Date: Fri, 25 Aug 2006 14:53:32 -0400 Subject: [PATCH 21/81] Account for tree entry memory costs in fast-import. Signed-off-by: Shawn O. Pearce --- fast-import.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fast-import.c b/fast-import.c index 1842d0738b..311db4e6d5 100644 --- a/fast-import.c +++ b/fast-import.c @@ -516,6 +516,7 @@ static struct tree_entry* new_tree_entry() if (!avail_tree_entry) { unsigned int n = tree_entry_alloc; + total_allocd += n * sizeof(struct tree_entry); avail_tree_entry = e = xmalloc(n * sizeof(struct tree_entry)); while (n--) { *((void**)e) = e + 1; From a6a1a831d9bdcdc0adb9a23ce450db08779c2871 Mon Sep 17 00:00:00 2001 From: "Shawn O. Pearce" Date: Fri, 25 Aug 2006 16:03:04 -0400 Subject: [PATCH 22/81] Added option to export the marks table when fast-import terminates. The marks table can be used by the frontend to load any commit after the import and compare it to whatever data the frontend knows about that commit. 
If the mark idnums can be easily correlated to some reference source then it's relatively trivial to compare the GIT tree to the reference to verify the accuracy of the import. Signed-off-by: Shawn O. Pearce --- fast-import.c | 36 +++++++++++++++++++++++++++++++++++- 1 file changed, 35 insertions(+), 1 deletion(-) diff --git a/fast-import.c b/fast-import.c index 311db4e6d5..d61da3adec 100644 --- a/fast-import.c +++ b/fast-import.c @@ -223,6 +223,7 @@ static unsigned int object_entry_alloc = 1000; static struct object_entry_pool *blocks; static struct object_entry *object_table[1 << 16]; static struct mark_set *marks; +static const char* mark_file; /* Our last blob */ static struct last_object last_blob; @@ -1122,6 +1123,36 @@ static void dump_tags() } } +static void dump_marks_helper(FILE *f, + unsigned long base, + struct mark_set *m) +{ + int k; + if (m->shift) { + for (k = 0; k < 1024; k++) { + if (m->data.sets[k]) + dump_marks_helper(f, (base + k) << m->shift, + m->data.sets[k]); + } + } else { + for (k = 0; k < 1024; k++) { + if (m->data.marked[k]) + fprintf(f, "%lu,%s\n", base + k, + sha1_to_hex(m->data.marked[k]->sha1)); + } + } +} + +static void dump_marks() +{ + if (mark_file) + { + FILE *f = fopen(mark_file, "w"); + dump_marks_helper(f, 0, marks); + fclose(f); + } +} + static void read_next_command() { read_line(&command_buf, stdin, '\n'); @@ -1544,7 +1575,7 @@ static void cmd_new_tag() } static const char fast_import_usage[] = -"git-fast-import [--objects=n] [--depth=n] [--active-branches=n] temp.pack"; +"git-fast-import [--objects=n] [--depth=n] [--active-branches=n] [--export-marks=marks.file] temp.pack"; int main(int argc, const char **argv) { @@ -1569,6 +1600,8 @@ int main(int argc, const char **argv) max_depth = strtoul(a + 8, NULL, 0); else if (!strncmp(a, "--active-branches=", 18)) max_active_branches = strtoul(a + 18, NULL, 0); + else if (!strncmp(a, "--export-marks=", 15)) + mark_file = a + 15; else die("unknown option %s", a); } @@ -1613,6 
+1646,7 @@ int main(int argc, const char **argv) write_index(idx_name); dump_branches(); dump_tags(); + dump_marks(); fprintf(stderr, "%s statistics:\n", argv[0]); fprintf(stderr, "---------------------------------------------------\n"); From 264244a0429e23616a6065f6f52a15711981a8db Mon Sep 17 00:00:00 2001 From: "Shawn O. Pearce" Date: Fri, 25 Aug 2006 23:07:06 -0400 Subject: [PATCH 23/81] Added --branch-log option to fast-import. This option can be used to have a record of every commit, the mark (if supplied) and branch name of the commit recorded into a log file when the commit is generated. This log can be useful to verify the results of an import as the commits can be compared to some source repository matching commits through the mark value. Signed-off-by: Shawn O. Pearce --- fast-import.c | 42 +++++++++++++++++++++++++++++++++++----- 1 file changed, 37 insertions(+), 5 deletions(-) diff --git a/fast-import.c b/fast-import.c index d61da3adec..8328e004bb 100644 --- a/fast-import.c +++ b/fast-import.c @@ -248,6 +248,7 @@ static struct tag *last_tag; /* Input stream parsing */ static struct strbuf command_buf; static unsigned long next_mark; +static FILE* branch_log; static void alloc_objects(int cnt) @@ -1137,7 +1138,7 @@ static void dump_marks_helper(FILE *f, } else { for (k = 0; k < 1024; k++) { if (m->data.marked[k]) - fprintf(f, "%lu,%s\n", base + k, + fprintf(f, ":%lu %s\n", base + k, sha1_to_hex(m->data.marked[k]->sha1)); } } @@ -1476,6 +1477,18 @@ static void cmd_new_commit() store_object(OBJ_COMMIT, body, sp - body, NULL, b->sha1, next_mark); free(body); b->last_commit = object_count_by_type[OBJ_COMMIT]; + + if (branch_log) { + int need_dq = quote_c_style(b->name, NULL, NULL, 0); + fprintf(branch_log, "commit "); + if (need_dq) { + fputc('"', branch_log); + quote_c_style(b->name, NULL, branch_log, 0); + fputc('"', branch_log); + } else + fprintf(branch_log, "%s", b->name); + fprintf(branch_log," :%lu %s\n",next_mark,sha1_to_hex(b->sha1)); + } } 
static void cmd_new_tag() @@ -1490,6 +1503,7 @@ static void cmd_new_tag() size_t msglen; char *body; struct tag *t; + unsigned long from_mark = 0; unsigned char sha1[20]; /* Obtain the new tag name from the rest of our command */ @@ -1528,10 +1542,10 @@ static void cmd_new_tag() if (s) { memcpy(sha1, s->sha1, 20); } else if (*from == ':') { - unsigned long idnum = strtoul(from + 1, NULL, 10); - struct object_entry *oe = find_mark(idnum); + from_mark = strtoul(from + 1, NULL, 10); + struct object_entry *oe = find_mark(from_mark); if (oe->type != OBJ_COMMIT) - die("Mark :%lu not a commit", idnum); + die("Mark :%lu not a commit", from_mark); memcpy(sha1, oe->sha1, 20); } else if (!get_sha1(from, sha1)) { unsigned long size; @@ -1572,10 +1586,22 @@ static void cmd_new_tag() store_object(OBJ_TAG, body, sp - body, NULL, t->sha1, 0); free(body); + + if (branch_log) { + int need_dq = quote_c_style(t->name, NULL, NULL, 0); + fprintf(branch_log, "tag "); + if (need_dq) { + fputc('"', branch_log); + quote_c_style(t->name, NULL, branch_log, 0); + fputc('"', branch_log); + } else + fprintf(branch_log, "%s", t->name); + fprintf(branch_log," :%lu %s\n",from_mark,sha1_to_hex(t->sha1)); + } } static const char fast_import_usage[] = -"git-fast-import [--objects=n] [--depth=n] [--active-branches=n] [--export-marks=marks.file] temp.pack"; +"git-fast-import [--objects=n] [--depth=n] [--active-branches=n] [--export-marks=marks.file] [--branch-log=log] temp.pack"; int main(int argc, const char **argv) { @@ -1602,6 +1628,11 @@ int main(int argc, const char **argv) max_active_branches = strtoul(a + 18, NULL, 0); else if (!strncmp(a, "--export-marks=", 15)) mark_file = a + 15; + else if (!strncmp(a, "--branch-log=", 13)) { + branch_log = fopen(a + 13, "w"); + if (!branch_log) + die("Can't create %s: %s", a + 13, strerror(errno)); + } else die("unknown option %s", a); } @@ -1647,6 +1678,7 @@ int main(int argc, const char **argv) dump_branches(); dump_tags(); dump_marks(); + 
fclose(branch_log); fprintf(stderr, "%s statistics:\n", argv[0]); fprintf(stderr, "---------------------------------------------------\n"); From 2eb26d8454de77f45bbbfc32eed2a6c3133fe963 Mon Sep 17 00:00:00 2001 From: "Shawn O. Pearce" Date: Sat, 26 Aug 2006 22:38:02 -0400 Subject: [PATCH 24/81] Fixed GPF in fast-import caused by unterminated linked list. fast-import was encountering a GPF when it ran out of free tree_entry objects but didn't know this was the cause because the last tree_entry wasn't terminated with a NULL pointer. The missing NULL pointer occurred when we allocated additional entries via xmalloc but didn't set the last tree_entry's "next" pointer to NULL. Signed-off-by: Shawn O. Pearce --- fast-import.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fast-import.c b/fast-import.c index 8328e004bb..194116be6f 100644 --- a/fast-import.c +++ b/fast-import.c @@ -520,10 +520,11 @@ static struct tree_entry* new_tree_entry() unsigned int n = tree_entry_alloc; total_allocd += n * sizeof(struct tree_entry); avail_tree_entry = e = xmalloc(n * sizeof(struct tree_entry)); - while (n--) { + while (n-- > 1) { *((void**)e) = e + 1; e++; } + *((void*)e) = NULL; } e = avail_tree_entry; From 35ef237cf630418c2e45752eb527268693a2895b Mon Sep 17 00:00:00 2001 From: "Shawn O. Pearce" Date: Sat, 26 Aug 2006 23:37:31 -0400 Subject: [PATCH 25/81] Fixed compile error in fast-import. Signed-off-by: Shawn O. Pearce --- fast-import.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fast-import.c b/fast-import.c index 194116be6f..309f4d353b 100644 --- a/fast-import.c +++ b/fast-import.c @@ -524,7 +524,7 @@ static struct tree_entry* new_tree_entry() *((void**)e) = e + 1; e++; } - *((void*)e) = NULL; + *((void**)e) = NULL; } e = avail_tree_entry; From 53dbce78a2a018bd2828d3ecc4123015f88ae36f Mon Sep 17 00:00:00 2001 From: "Shawn O. 
Pearce" Date: Sun, 27 Aug 2006 05:53:48 -0400 Subject: [PATCH 26/81] Map only part of the generated pack file at any point in time. When generating a very large pack file (for example close to 1 GB in size) it may be impossible for the kernel to find a contiguous free range within a 32 bit address space for the mapping to be located at. This is especially problematic on large imports where there is a lot of malloc activity occurring within the same process and the malloc'd regions may straddle the previously mapped regions, thereby creating large holes in the address space. So instead we map only 128 MB of the pack at any given time. This will likely increase the number of times the file gets mapped (with additional system time required to update the page tables more frequently) but will allow the program to handle packs up to 4 GB in size. Signed-off-by: Shawn O. Pearce --- fast-import.c | 44 ++++++++++++++++++++++++-------------------- 1 file changed, 24 insertions(+), 20 deletions(-) diff --git a/fast-import.c b/fast-import.c index 309f4d353b..f3376c60ef 100644 --- a/fast-import.c +++ b/fast-import.c @@ -215,8 +215,10 @@ static struct atom_str **atom_table; static int pack_fd; static unsigned long pack_size; static unsigned char pack_sha1[20]; -static void* pack_base; -static size_t pack_mlen; +static unsigned char* pack_base; +static unsigned long pack_moff; +static unsigned long pack_mlen = 128*1024*1024; +static unsigned long page_size; /* Table of objects we've written. 
*/ static unsigned int object_entry_alloc = 1000; @@ -676,23 +678,26 @@ static int store_object( return 0; } -static void* map_pack(unsigned long offset) +static unsigned char* map_pack(unsigned long offset, unsigned int *left) { if (offset >= pack_size) die("object offset outside of pack file"); - if (offset >= pack_mlen) { + if (!pack_base + || offset < pack_moff + || (offset + 20) >= (pack_moff + pack_mlen)) { if (pack_base) munmap(pack_base, pack_mlen); - /* round out how much we map to 16 MB units */ - pack_mlen = pack_size; - if (pack_mlen & ((1 << 24) - 1)) - pack_mlen = ((pack_mlen >> 24) + 1) << 24; - pack_base = mmap(NULL,pack_mlen,PROT_READ,MAP_SHARED,pack_fd,0); + pack_moff = (offset / page_size) * page_size; + pack_base = mmap(NULL,pack_mlen,PROT_READ,MAP_SHARED, + pack_fd,pack_moff); if (pack_base == MAP_FAILED) die("Failed to map generated pack: %s", strerror(errno)); remap_count++; } - return (char*)pack_base + offset; + offset -= pack_moff; + if (left) + *left = pack_mlen - offset; + return pack_base + offset; } static unsigned long unpack_object_header(unsigned long offset, @@ -703,12 +708,12 @@ static unsigned long unpack_object_header(unsigned long offset, unsigned char c; unsigned long size; - c = *(unsigned char*)map_pack(offset++); + c = *map_pack(offset++, NULL); *type = (c >> 4) & 7; size = c & 15; shift = 4; while (c & 0x80) { - c = *(unsigned char*)map_pack(offset++); + c = *map_pack(offset++, NULL); size += (c & 0x7f) << shift; shift += 7; } @@ -725,8 +730,7 @@ static void *unpack_non_delta_entry(unsigned long o, unsigned long sz) result[sz] = 0; memset(&stream, 0, sizeof(stream)); - stream.next_in = map_pack(o); - stream.avail_in = pack_mlen - o; + stream.next_in = map_pack(o, &stream.avail_in); stream.next_out = result; stream.avail_out = sz; @@ -735,13 +739,12 @@ static void *unpack_non_delta_entry(unsigned long o, unsigned long sz) int st = inflate(&stream, Z_FINISH); if (st == Z_STREAM_END) break; - if (st == Z_OK) { - o = 
stream.next_in - (unsigned char*)pack_base; - stream.next_in = map_pack(o); - stream.avail_in = pack_mlen - o; + if (st == Z_OK || st == Z_BUF_ERROR) { + o = stream.next_in - pack_base + pack_moff; + stream.next_in = map_pack(o, &stream.avail_in); continue; } - die("Error from zlib during inflate."); + die("Error %i from zlib during inflate.", st); } inflateEnd(&stream); if (stream.total_out != sz) @@ -760,7 +763,7 @@ static void *unpack_delta_entry(unsigned long offset, void *delta_data, *base, *result; unsigned long base_size, result_size; - base_sha1 = (unsigned char*)map_pack(offset + 20) - 20; + base_sha1 = map_pack(offset, NULL); base_oe = find_object(base_sha1); if (!base_oe) die("I'm broken; I can't find a base I know must be here."); @@ -1615,6 +1618,7 @@ int main(int argc, const char **argv) setup_ident(); git_config(git_default_config); + page_size = getpagesize(); for (i = 1; i < argc; i++) { const char *a = argv[i]; From 5fced8dc6f4844997b6e25a67a00f428775c5233 Mon Sep 17 00:00:00 2001 From: "Shawn O. Pearce" Date: Sun, 27 Aug 2006 06:20:49 -0400 Subject: [PATCH 27/81] Added 'reset' command to clear a branch's tree. Sometimes an import frontend may need to work with a temporary branch which will actually contain many different branches over the life of the import. This is especially useful when the frontend needs to create a tag from a set of file versions which are otherwise never a commit. Signed-off-by: Shawn O. 
Pearce --- fast-import.c | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/fast-import.c b/fast-import.c index f3376c60ef..778b8bfdd4 100644 --- a/fast-import.c +++ b/fast-import.c @@ -6,6 +6,7 @@ Format of STDIN stream: cmd ::= new_blob | new_commit | new_tag + | reset_branch ; new_blob ::= 'blob' lf @@ -34,6 +35,8 @@ Format of STDIN stream: tag_msg; tag_msg ::= data; + reset_branch ::= 'reset' sp ref_str lf; + # note: the first idnum in a stream should be 1 and subsequent # idnums should not have gaps between values as this will cause # the stream parser to reserve space for the gapped values. An @@ -1604,6 +1607,33 @@ static void cmd_new_tag() } } +static void cmd_reset_branch() +{ + struct branch *b; + char *str_uq; + const char *endp; + char *sp; + + /* Obtain the branch name from the rest of our command */ + sp = strchr(command_buf.buf, ' ') + 1; + str_uq = unquote_c_style(sp, &endp); + if (str_uq) { + if (*endp) + die("Garbage after ref in: %s", command_buf.buf); + sp = str_uq; + } + b = lookup_branch(sp); + if (b) { + b->last_commit = 0; + if (b->branch_tree.tree) { + release_tree_content_recursive(b->branch_tree.tree); + b->branch_tree.tree = NULL; + } + } + if (str_uq) + free(str_uq); +} + static const char fast_import_usage[] = "git-fast-import [--objects=n] [--depth=n] [--active-branches=n] [--export-marks=marks.file] [--branch-log=log] temp.pack"; @@ -1673,6 +1703,8 @@ int main(int argc, const char **argv) cmd_new_commit(); else if (!strncmp("tag ", command_buf.buf, 4)) cmd_new_tag(); + else if (!strncmp("reset ", command_buf.buf, 6)) + cmd_reset_branch(); else die("Unsupported command: %s", command_buf.buf); } From 08d7e892a714dec8471cd45add2b1da24f66b3e7 Mon Sep 17 00:00:00 2001 From: "Shawn O. Pearce" Date: Sun, 27 Aug 2006 20:13:44 -0400 Subject: [PATCH 28/81] Don't crash fast-import if no branch log was requested. Signed-off-by: Shawn O. 
Pearce --- fast-import.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fast-import.c b/fast-import.c index 778b8bfdd4..5376b5e15c 100644 --- a/fast-import.c +++ b/fast-import.c @@ -1715,7 +1715,8 @@ int main(int argc, const char **argv) dump_branches(); dump_tags(); dump_marks(); - fclose(branch_log); + if (branch_log) + fclose(branch_log); fprintf(stderr, "%s statistics:\n", argv[0]); fprintf(stderr, "---------------------------------------------------\n"); From 445b85999a309c8e5c7f928484c57325c280152e Mon Sep 17 00:00:00 2001 From: "Shawn O. Pearce" Date: Mon, 28 Aug 2006 10:46:58 -0400 Subject: [PATCH 29/81] Converted hash memcpy/memcmp to new hashcpy/hashcmp/hashclr. Signed-off-by: Shawn O. Pearce --- fast-import.c | 52 +++++++++++++++++++++++++-------------------------- 1 file changed, 26 insertions(+), 26 deletions(-) diff --git a/fast-import.c b/fast-import.c index 5376b5e15c..b1b2382560 100644 --- a/fast-import.c +++ b/fast-import.c @@ -277,7 +277,7 @@ static struct object_entry* new_object(unsigned char *sha1) alloc_objects(object_entry_alloc); e = blocks->next_free++; - memcpy(e->sha1, sha1, sizeof(e->sha1)); + hashcpy(e->sha1, sha1); return e; } @@ -286,7 +286,7 @@ static struct object_entry* find_object(unsigned char *sha1) unsigned int h = sha1[0] << 8 | sha1[1]; struct object_entry *e; for (e = object_table[h]; e; e = e->next) - if (!memcmp(sha1, e->sha1, sizeof(e->sha1))) + if (!hashcmp(sha1, e->sha1)) return e; return NULL; } @@ -298,7 +298,7 @@ static struct object_entry* insert_object(unsigned char *sha1) struct object_entry *p = NULL; while (e) { - if (!memcmp(sha1, e->sha1, sizeof(e->sha1))) + if (!hashcmp(sha1, e->sha1)) return e; p = e; e = e->next; @@ -616,7 +616,7 @@ static int store_object( SHA1_Update(&c, dat, datlen); SHA1_Final(sha1, &c); if (sha1out) - memcpy(sha1out, sha1, sizeof(sha1)); + hashcpy(sha1out, sha1); e = insert_object(sha1); if (mark) @@ -676,7 +676,7 @@ static int store_object( free(last->data); 
last->data = dat; last->len = datlen; - memcpy(last->sha1, sha1, sizeof(sha1)); + hashcpy(last->sha1, sha1); } return 0; } @@ -826,7 +826,7 @@ static void load_tree(struct tree_entry *root) const char *c; root->tree = t = new_tree_content(8); - if (!memcmp(root->sha1, null_sha1, 20)) + if (is_null_sha1(root->sha1)) return; myoe = find_object(root->sha1); @@ -855,7 +855,7 @@ static void load_tree(struct tree_entry *root) die("Corrupt mode in %s", sha1_to_hex(root->sha1)); e->name = to_atom(c, strlen(c)); c += e->name->str_len + 1; - memcpy(e->sha1, c, sizeof(e->sha1)); + hashcpy(e->sha1, c); c += 20; } free(buf); @@ -877,7 +877,7 @@ static void store_tree(struct tree_entry *root) size_t maxlen; char *buf, *c; - if (memcmp(root->sha1, null_sha1, 20)) + if (!is_null_sha1(root->sha1)) return; maxlen = 0; @@ -895,7 +895,7 @@ static void store_tree(struct tree_entry *root) *c++ = ' '; strcpy(c, e->name->str_dat); c += e->name->str_len + 1; - memcpy(c, e->sha1, 20); + hashcpy(c, e->sha1); c += 20; } store_object(OBJ_TREE, buf, c - buf, NULL, root->sha1, 0); @@ -923,15 +923,15 @@ static int tree_content_set( e = t->entries[i]; if (e->name->str_len == n && !strncmp(p, e->name->str_dat, n)) { if (!slash1) { - if (e->mode == mode && !memcmp(e->sha1, sha1, 20)) + if (e->mode == mode && !hashcmp(e->sha1, sha1)) return 0; e->mode = mode; - memcpy(e->sha1, sha1, 20); + hashcpy(e->sha1, sha1); if (e->tree) { release_tree_content_recursive(e->tree); e->tree = NULL; } - memcpy(root->sha1, null_sha1, 20); + hashclr(root->sha1); return 1; } if (!S_ISDIR(e->mode)) { @@ -941,7 +941,7 @@ static int tree_content_set( if (!e->tree) load_tree(e); if (tree_content_set(e, slash1 + 1, sha1, mode)) { - memcpy(root->sha1, null_sha1, 20); + hashclr(root->sha1); return 1; } return 0; @@ -960,9 +960,9 @@ static int tree_content_set( } else { e->tree = NULL; e->mode = mode; - memcpy(e->sha1, sha1, 20); + hashcpy(e->sha1, sha1); } - memcpy(root->sha1, null_sha1, 20); + hashclr(root->sha1); return 1; 
} @@ -989,7 +989,7 @@ static int tree_content_remove(struct tree_entry *root, const char *p) if (tree_content_remove(e, slash1 + 1)) { if (!e->tree->entry_count) goto del_entry; - memcpy(root->sha1, null_sha1, 20); + hashclr(root->sha1); return 1; } return 0; @@ -1002,7 +1002,7 @@ del_entry: t->entries[i-1] = t->entries[i]; t->entry_count--; release_tree_entry(e); - memcpy(root->sha1, null_sha1, 20); + hashclr(root->sha1); return 1; } @@ -1054,7 +1054,7 @@ static int oecmp (const void *_a, const void *_b) { struct object_entry *a = *((struct object_entry**)_a); struct object_entry *b = *((struct object_entry**)_b); - return memcmp(a->sha1, b->sha1, sizeof(a->sha1)); + return hashcmp(a->sha1, b->sha1); } static void write_index(const char *idx_name) @@ -1359,8 +1359,8 @@ static void cmd_from(struct branch *b) if (b == s) die("Can't create a branch from itself: %s", b->name); else if (s) { - memcpy(b->sha1, s->sha1, 20); - memcpy(b->branch_tree.sha1, s->branch_tree.sha1, 20); + hashcpy(b->sha1, s->sha1); + hashcpy(b->branch_tree.sha1, s->branch_tree.sha1); } else if (*from == ':') { unsigned long idnum = strtoul(from + 1, NULL, 10); struct object_entry *oe = find_mark(idnum); @@ -1368,7 +1368,7 @@ static void cmd_from(struct branch *b) char *buf; if (oe->type != OBJ_COMMIT) die("Mark :%lu not a commit", idnum); - memcpy(b->sha1, oe->sha1, 20); + hashcpy(b->sha1, oe->sha1); buf = unpack_entry(oe->offset, &size); if (!buf || size < 46) die("Not a valid commit: %s", from); @@ -1377,8 +1377,8 @@ static void cmd_from(struct branch *b) die("The commit %s is corrupt", sha1_to_hex(b->sha1)); free(buf); } else if (!get_sha1(from, b->sha1)) { - if (!memcmp(b->sha1, null_sha1, 20)) - memcpy(b->branch_tree.sha1, null_sha1, 20); + if (is_null_sha1(b->sha1)) + hashclr(b->branch_tree.sha1); else { unsigned long size; char *buf; @@ -1467,7 +1467,7 @@ static void cmd_new_commit() : 2 * strlen(committer))); sp = body; sp += sprintf(sp, "tree %s\n", sha1_to_hex(b->branch_tree.sha1)); - 
if (memcmp(b->sha1, null_sha1, 20)) + if (!is_null_sha1(b->sha1)) sp += sprintf(sp, "parent %s\n", sha1_to_hex(b->sha1)); if (author) sp += sprintf(sp, "%s\n", author); @@ -1547,13 +1547,13 @@ static void cmd_new_tag() s = lookup_branch(from); if (s) { - memcpy(sha1, s->sha1, 20); + hashcpy(sha1, s->sha1); } else if (*from == ':') { from_mark = strtoul(from + 1, NULL, 10); struct object_entry *oe = find_mark(from_mark); if (oe->type != OBJ_COMMIT) die("Mark :%lu not a commit", from_mark); - memcpy(sha1, oe->sha1, 20); + hashcpy(sha1, oe->sha1); } else if (!get_sha1(from, sha1)) { unsigned long size; char *buf; From 4cabf8583f934260697a065186f3dce135834ede Mon Sep 17 00:00:00 2001 From: "Shawn O. Pearce" Date: Mon, 28 Aug 2006 12:22:50 -0400 Subject: [PATCH 30/81] Implemented tree delta compression in fast-import. We now store for every tree entry two modes and two sha1 values; the base (aka "version 0") and the current/new (aka "version 1"). When we generate a tree object we also regenerate the prior version object and use that as our base object for a delta. This strategy saves a significant amount of memory as we can continue to use the atom pool for file/directory names and only increases each tree entry by an additional 24 bytes of memory. Branches should automatically delta against their ancestor tree, unless the ancestor tree is already at the delta chain limit. Signed-off-by: Shawn O. 
Pearce --- fast-import.c | 228 +++++++++++++++++++++++++++++++++++--------------- 1 file changed, 159 insertions(+), 69 deletions(-) diff --git a/fast-import.c b/fast-import.c index b1b2382560..6b01120415 100644 --- a/fast-import.c +++ b/fast-import.c @@ -132,7 +132,7 @@ struct mark_set struct last_object { void *data; - unsigned int len; + unsigned long len; unsigned int depth; unsigned char sha1[20]; }; @@ -157,14 +157,18 @@ struct tree_entry { struct tree_content *tree; struct atom_str* name; - unsigned int mode; - unsigned char sha1[20]; + struct tree_entry_ms + { + unsigned int mode; + unsigned char sha1[20]; + } versions[2]; }; struct tree_content { unsigned int entry_capacity; /* must match avail_tree_content */ unsigned int entry_count; + unsigned int delta_depth; struct tree_entry *entries[FLEX_ARRAY]; /* more */ }; @@ -203,6 +207,7 @@ static unsigned long duplicate_count; static unsigned long marks_set_count; static unsigned long object_count_by_type[9]; static unsigned long duplicate_count_by_type[9]; +static unsigned long delta_count_by_type[9]; /* Memory pools */ static size_t mem_pool_alloc = 2*1024*1024 - sizeof(struct mem_pool); @@ -224,7 +229,7 @@ static unsigned long pack_mlen = 128*1024*1024; static unsigned long page_size; /* Table of objects we've written. 
*/ -static unsigned int object_entry_alloc = 1000; +static unsigned int object_entry_alloc = 5000; static struct object_entry_pool *blocks; static struct object_entry *object_table[1 << 16]; static struct mark_set *marks; @@ -486,6 +491,7 @@ static struct tree_content* new_tree_content(unsigned int cnt) t = (struct tree_content*)f; t->entry_count = 0; + t->delta_depth = 0; return t; } @@ -512,6 +518,7 @@ static struct tree_content* grow_tree_content( { struct tree_content *r = new_tree_content(t->entry_count + amt); r->entry_count = t->entry_count; + r->delta_depth = t->delta_depth; memcpy(r->entries,t->entries,t->entry_count*sizeof(t->entries[0])); release_tree_content(t); return r; @@ -642,6 +649,7 @@ static int store_object( deflateInit(&s, zlib_compression_level); if (delta) { + delta_count_by_type[type]++; last->depth++; s.next_in = delta; s.avail_in = deltalen; @@ -755,11 +763,14 @@ static void *unpack_non_delta_entry(unsigned long o, unsigned long sz) return result; } -static void *unpack_entry(unsigned long offset, unsigned long *sizep); +static void *unpack_entry(unsigned long offset, + unsigned long *sizep, + unsigned int *delta_depth); static void *unpack_delta_entry(unsigned long offset, unsigned long delta_size, - unsigned long *sizep) + unsigned long *sizep, + unsigned int *delta_depth) { struct object_entry *base_oe; unsigned char *base_sha1; @@ -770,7 +781,7 @@ static void *unpack_delta_entry(unsigned long offset, base_oe = find_object(base_sha1); if (!base_oe) die("I'm broken; I can't find a base I know must be here."); - base = unpack_entry(base_oe->offset, &base_size); + base = unpack_entry(base_oe->offset, &base_size, delta_depth); delta_data = unpack_non_delta_entry(offset + 20, delta_size); result = patch_delta(base, base_size, delta_data, delta_size, @@ -780,10 +791,13 @@ static void *unpack_delta_entry(unsigned long offset, free(delta_data); free(base); *sizep = result_size; + (*delta_depth)++; return result; } -static void 
*unpack_entry(unsigned long offset, unsigned long *sizep) +static void *unpack_entry(unsigned long offset, + unsigned long *sizep, + unsigned int *delta_depth) { unsigned long size; enum object_type kind; @@ -791,12 +805,13 @@ static void *unpack_entry(unsigned long offset, unsigned long *sizep) offset = unpack_object_header(offset, &kind, &size); switch (kind) { case OBJ_DELTA: - return unpack_delta_entry(offset, size, sizep); + return unpack_delta_entry(offset, size, sizep, delta_depth); case OBJ_COMMIT: case OBJ_TREE: case OBJ_BLOB: case OBJ_TAG: *sizep = size; + *delta_depth = 0; return unpack_non_delta_entry(offset, size); default: die("I created an object I can't read!"); @@ -819,6 +834,7 @@ static const char *get_mode(const char *str, unsigned int *modep) static void load_tree(struct tree_entry *root) { + unsigned char* sha1 = root->versions[1].sha1; struct object_entry *myoe; struct tree_content *t; unsigned long size; @@ -826,19 +842,19 @@ static void load_tree(struct tree_entry *root) const char *c; root->tree = t = new_tree_content(8); - if (is_null_sha1(root->sha1)) + if (is_null_sha1(sha1)) return; - myoe = find_object(root->sha1); + myoe = find_object(sha1); if (myoe) { if (myoe->type != OBJ_TREE) - die("Not a tree: %s", sha1_to_hex(root->sha1)); - buf = unpack_entry(myoe->offset, &size); + die("Not a tree: %s", sha1_to_hex(sha1)); + buf = unpack_entry(myoe->offset, &size, &t->delta_depth); } else { char type[20]; - buf = read_sha1_file(root->sha1, type, &size); + buf = read_sha1_file(sha1, type, &size); if (!buf || strcmp(type, tree_type)) - die("Can't load tree %s", sha1_to_hex(root->sha1)); + die("Can't load tree %s", sha1_to_hex(sha1)); } c = buf; @@ -850,56 +866,116 @@ static void load_tree(struct tree_entry *root) t->entries[t->entry_count++] = e; e->tree = NULL; - c = get_mode(c, &e->mode); + c = get_mode(c, &e->versions[1].mode); if (!c) - die("Corrupt mode in %s", sha1_to_hex(root->sha1)); + die("Corrupt mode in %s", sha1_to_hex(sha1)); + 
e->versions[0].mode = e->versions[1].mode; e->name = to_atom(c, strlen(c)); c += e->name->str_len + 1; - hashcpy(e->sha1, c); + hashcpy(e->versions[0].sha1, (unsigned char*)c); + hashcpy(e->versions[1].sha1, (unsigned char*)c); c += 20; } free(buf); } -static int tecmp (const void *_a, const void *_b) +static int tecmp0 (const void *_a, const void *_b) { struct tree_entry *a = *((struct tree_entry**)_a); struct tree_entry *b = *((struct tree_entry**)_b); return base_name_compare( - a->name->str_dat, a->name->str_len, a->mode, - b->name->str_dat, b->name->str_len, b->mode); + a->name->str_dat, a->name->str_len, a->versions[0].mode, + b->name->str_dat, b->name->str_len, b->versions[0].mode); +} + +static int tecmp1 (const void *_a, const void *_b) +{ + struct tree_entry *a = *((struct tree_entry**)_a); + struct tree_entry *b = *((struct tree_entry**)_b); + return base_name_compare( + a->name->str_dat, a->name->str_len, a->versions[1].mode, + b->name->str_dat, b->name->str_len, b->versions[1].mode); +} + +static void* mktree(struct tree_content *t, int v, unsigned long *szp) +{ + size_t maxlen = 0; + unsigned int i; + char *buf, *c; + + if (!v) + qsort(t->entries,t->entry_count,sizeof(t->entries[0]),tecmp0); + else + qsort(t->entries,t->entry_count,sizeof(t->entries[0]),tecmp1); + + for (i = 0; i < t->entry_count; i++) { + if (t->entries[i]->versions[v].mode) + maxlen += t->entries[i]->name->str_len + 34; + } + + buf = c = xmalloc(maxlen); + for (i = 0; i < t->entry_count; i++) { + struct tree_entry *e = t->entries[i]; + if (!e->versions[v].mode) + continue; + c += sprintf(c, "%o", e->versions[v].mode); + *c++ = ' '; + strcpy(c, e->name->str_dat); + c += e->name->str_len + 1; + hashcpy((unsigned char*)c, e->versions[v].sha1); + c += 20; + } + + *szp = c - buf; + return buf; } static void store_tree(struct tree_entry *root) { struct tree_content *t = root->tree; - unsigned int i; - size_t maxlen; - char *buf, *c; + unsigned int i, j, del; + unsigned long vers1len; + 
void **vers1dat; + struct last_object lo; - if (!is_null_sha1(root->sha1)) + if (!is_null_sha1(root->versions[1].sha1)) return; - maxlen = 0; for (i = 0; i < t->entry_count; i++) { - maxlen += t->entries[i]->name->str_len + 34; if (t->entries[i]->tree) store_tree(t->entries[i]); } - qsort(t->entries, t->entry_count, sizeof(t->entries[0]), tecmp); - buf = c = xmalloc(maxlen); - for (i = 0; i < t->entry_count; i++) { - struct tree_entry *e = t->entries[i]; - c += sprintf(c, "%o", e->mode); - *c++ = ' '; - strcpy(c, e->name->str_dat); - c += e->name->str_len + 1; - hashcpy(c, e->sha1); - c += 20; + if (is_null_sha1(root->versions[0].sha1) + || !find_object(root->versions[0].sha1)) { + lo.data = NULL; + lo.depth = 0; + } else { + lo.data = mktree(t, 0, &lo.len); + lo.depth = t->delta_depth; + hashcpy(lo.sha1, root->versions[0].sha1); } - store_object(OBJ_TREE, buf, c - buf, NULL, root->sha1, 0); - free(buf); + vers1dat = mktree(t, 1, &vers1len); + + store_object(OBJ_TREE, vers1dat, vers1len, + &lo, root->versions[1].sha1, 0); + /* note: lo.dat (if created) was freed by store_object */ + free(vers1dat); + + t->delta_depth = lo.depth; + hashcpy(root->versions[0].sha1, root->versions[1].sha1); + for (i = 0, j = 0, del = 0; i < t->entry_count; i++) { + struct tree_entry *e = t->entries[i]; + if (e->versions[1].mode) { + e->versions[0].mode = e->versions[1].mode; + hashcpy(e->versions[0].sha1, e->versions[1].sha1); + t->entries[j++] = e; + } else { + release_tree_entry(e); + del++; + } + } + t->entry_count -= del; } static int tree_content_set( @@ -923,25 +999,26 @@ static int tree_content_set( e = t->entries[i]; if (e->name->str_len == n && !strncmp(p, e->name->str_dat, n)) { if (!slash1) { - if (e->mode == mode && !hashcmp(e->sha1, sha1)) + if (e->versions[1].mode == mode + && !hashcmp(e->versions[1].sha1, sha1)) return 0; - e->mode = mode; - hashcpy(e->sha1, sha1); + e->versions[1].mode = mode; + hashcpy(e->versions[1].sha1, sha1); if (e->tree) { 
release_tree_content_recursive(e->tree); e->tree = NULL; } - hashclr(root->sha1); + hashclr(root->versions[1].sha1); return 1; } - if (!S_ISDIR(e->mode)) { + if (!S_ISDIR(e->versions[1].mode)) { e->tree = new_tree_content(8); - e->mode = S_IFDIR; + e->versions[1].mode = S_IFDIR; } if (!e->tree) load_tree(e); if (tree_content_set(e, slash1 + 1, sha1, mode)) { - hashclr(root->sha1); + hashclr(root->versions[1].sha1); return 1; } return 0; @@ -952,17 +1029,19 @@ static int tree_content_set( root->tree = t = grow_tree_content(t, 8); e = new_tree_entry(); e->name = to_atom(p, n); + e->versions[0].mode = 0; + hashclr(e->versions[0].sha1); t->entries[t->entry_count++] = e; if (slash1) { e->tree = new_tree_content(8); - e->mode = S_IFDIR; + e->versions[1].mode = S_IFDIR; tree_content_set(e, slash1 + 1, sha1, mode); } else { e->tree = NULL; - e->mode = mode; - hashcpy(e->sha1, sha1); + e->versions[1].mode = mode; + hashcpy(e->versions[1].sha1, sha1); } - hashclr(root->sha1); + hashclr(root->versions[1].sha1); return 1; } @@ -982,14 +1061,14 @@ static int tree_content_remove(struct tree_entry *root, const char *p) for (i = 0; i < t->entry_count; i++) { e = t->entries[i]; if (e->name->str_len == n && !strncmp(p, e->name->str_dat, n)) { - if (!slash1 || !S_ISDIR(e->mode)) + if (!slash1 || !S_ISDIR(e->versions[1].mode)) goto del_entry; if (!e->tree) load_tree(e); if (tree_content_remove(e, slash1 + 1)) { if (!e->tree->entry_count) goto del_entry; - hashclr(root->sha1); + hashclr(root->versions[1].sha1); return 1; } return 0; @@ -998,11 +1077,13 @@ static int tree_content_remove(struct tree_entry *root, const char *p) return 0; del_entry: - for (i++; i < t->entry_count; i++) - t->entries[i-1] = t->entries[i]; - t->entry_count--; - release_tree_entry(e); - hashclr(root->sha1); + if (e->tree) { + release_tree_content_recursive(e->tree); + e->tree = NULL; + } + e->versions[1].mode = 0; + hashclr(e->versions[1].sha1); + hashclr(root->versions[1].sha1); return 1; } @@ -1359,27 
+1440,33 @@ static void cmd_from(struct branch *b) if (b == s) die("Can't create a branch from itself: %s", b->name); else if (s) { + unsigned char *t = s->branch_tree.versions[1].sha1; hashcpy(b->sha1, s->sha1); - hashcpy(b->branch_tree.sha1, s->branch_tree.sha1); + hashcpy(b->branch_tree.versions[0].sha1, t); + hashcpy(b->branch_tree.versions[1].sha1, t); } else if (*from == ':') { unsigned long idnum = strtoul(from + 1, NULL, 10); struct object_entry *oe = find_mark(idnum); unsigned long size; + unsigned int depth; char *buf; if (oe->type != OBJ_COMMIT) die("Mark :%lu not a commit", idnum); hashcpy(b->sha1, oe->sha1); - buf = unpack_entry(oe->offset, &size); + buf = unpack_entry(oe->offset, &size, &depth); if (!buf || size < 46) die("Not a valid commit: %s", from); if (memcmp("tree ", buf, 5) - || get_sha1_hex(buf + 5, b->branch_tree.sha1)) + || get_sha1_hex(buf + 5, b->branch_tree.versions[1].sha1)) die("The commit %s is corrupt", sha1_to_hex(b->sha1)); free(buf); + hashcpy(b->branch_tree.versions[0].sha1, + b->branch_tree.versions[1].sha1); } else if (!get_sha1(from, b->sha1)) { - if (is_null_sha1(b->sha1)) - hashclr(b->branch_tree.sha1); - else { + if (is_null_sha1(b->sha1)) { + hashclr(b->branch_tree.versions[0].sha1); + hashclr(b->branch_tree.versions[1].sha1); + } else { unsigned long size; char *buf; @@ -1388,9 +1475,11 @@ static void cmd_from(struct branch *b) if (!buf || size < 46) die("Not a valid commit: %s", from); if (memcmp("tree ", buf, 5) - || get_sha1_hex(buf + 5, b->branch_tree.sha1)) + || get_sha1_hex(buf + 5, b->branch_tree.versions[1].sha1)) die("The commit %s is corrupt", sha1_to_hex(b->sha1)); free(buf); + hashcpy(b->branch_tree.versions[0].sha1, + b->branch_tree.versions[1].sha1); } } else die("Invalid ref name or SHA1 expression: %s", from); @@ -1466,7 +1555,8 @@ static void cmd_new_commit() ? 
strlen(author) + strlen(committer) : 2 * strlen(committer))); sp = body; - sp += sprintf(sp, "tree %s\n", sha1_to_hex(b->branch_tree.sha1)); + sp += sprintf(sp, "tree %s\n", + sha1_to_hex(b->branch_tree.versions[1].sha1)); if (!is_null_sha1(b->sha1)) sp += sprintf(sp, "parent %s\n", sha1_to_hex(b->sha1)); if (author) @@ -1722,10 +1812,10 @@ int main(int argc, const char **argv) fprintf(stderr, "---------------------------------------------------\n"); fprintf(stderr, "Alloc'd objects: %10lu (%10lu overflow )\n", alloc_count, alloc_count - est_obj_cnt); fprintf(stderr, "Total objects: %10lu (%10lu duplicates)\n", object_count, duplicate_count); - fprintf(stderr, " blobs : %10lu (%10lu duplicates)\n", object_count_by_type[OBJ_BLOB], duplicate_count_by_type[OBJ_BLOB]); - fprintf(stderr, " trees : %10lu (%10lu duplicates)\n", object_count_by_type[OBJ_TREE], duplicate_count_by_type[OBJ_TREE]); - fprintf(stderr, " commits: %10lu (%10lu duplicates)\n", object_count_by_type[OBJ_COMMIT], duplicate_count_by_type[OBJ_COMMIT]); - fprintf(stderr, " tags : %10lu (%10lu duplicates)\n", object_count_by_type[OBJ_TAG], duplicate_count_by_type[OBJ_TAG]); + fprintf(stderr, " blobs : %10lu (%10lu duplicates %10lu deltas)\n", object_count_by_type[OBJ_BLOB], duplicate_count_by_type[OBJ_BLOB], delta_count_by_type[OBJ_BLOB]); + fprintf(stderr, " trees : %10lu (%10lu duplicates %10lu deltas)\n", object_count_by_type[OBJ_TREE], duplicate_count_by_type[OBJ_TREE], delta_count_by_type[OBJ_TREE]); + fprintf(stderr, " commits: %10lu (%10lu duplicates %10lu deltas)\n", object_count_by_type[OBJ_COMMIT], duplicate_count_by_type[OBJ_COMMIT], delta_count_by_type[OBJ_COMMIT]); + fprintf(stderr, " tags : %10lu (%10lu duplicates %10lu deltas)\n", object_count_by_type[OBJ_TAG], duplicate_count_by_type[OBJ_TAG], delta_count_by_type[OBJ_TAG]); fprintf(stderr, "Total branches: %10lu (%10lu loads )\n", branch_count, branch_load_count); fprintf(stderr, " marks: %10u (%10lu unique )\n", (1 << marks->shift) * 
1024, marks_set_count); fprintf(stderr, " atoms: %10u\n", atom_cnt); From e2eb469d1ff9595882c8329ad415b1d7246769d0 Mon Sep 17 00:00:00 2001 From: "Shawn O. Pearce" Date: Mon, 28 Aug 2006 13:02:51 -0400 Subject: [PATCH 31/81] Recycle data buffers for tree generation in fast-import. We only ever generate at most two tree streams at a time. Since most trees are around the same size we can simply recycle the buffers from one tree generation to the next rather than constantly xmalloc'ing and free'ing them. This should perform slightly better when handling a large number of trees as malloc has less work to do. Signed-off-by: Shawn O. Pearce --- fast-import.c | 46 ++++++++++++++++++++++++++++++++-------------- 1 file changed, 32 insertions(+), 14 deletions(-) diff --git a/fast-import.c b/fast-import.c index 6b01120415..8d15a05739 100644 --- a/fast-import.c +++ b/fast-import.c @@ -134,6 +134,7 @@ struct last_object void *data; unsigned long len; unsigned int depth; + int no_free; unsigned char sha1[20]; }; @@ -195,6 +196,12 @@ struct tag unsigned char sha1[20]; }; +struct dbuf +{ + void *buffer; + size_t capacity; +}; + /* Stats and misc. 
counters */ static unsigned long max_depth = 10; @@ -243,6 +250,8 @@ static unsigned int tree_entry_alloc = 1000; static void *avail_tree_entry; static unsigned int avail_tree_table_sz = 100; static struct avail_tree_content **avail_tree_table; +static struct dbuf old_tree; +static struct dbuf new_tree; /* Branch data */ static unsigned long max_active_branches = 5; @@ -680,7 +689,7 @@ static int store_object( if (delta) free(delta); if (last) { - if (last->data) + if (last->data && !last->no_free) free(last->data); last->data = dat; last->len = datlen; @@ -897,11 +906,14 @@ static int tecmp1 (const void *_a, const void *_b) b->name->str_dat, b->name->str_len, b->versions[1].mode); } -static void* mktree(struct tree_content *t, int v, unsigned long *szp) +static void mktree(struct tree_content *t, + int v, + unsigned long *szp, + struct dbuf *b) { size_t maxlen = 0; unsigned int i; - char *buf, *c; + char *c; if (!v) qsort(t->entries,t->entry_count,sizeof(t->entries[0]),tecmp0); @@ -913,7 +925,16 @@ static void* mktree(struct tree_content *t, int v, unsigned long *szp) maxlen += t->entries[i]->name->str_len + 34; } - buf = c = xmalloc(maxlen); + if (b->buffer) { + if (b->capacity < maxlen) + b->capacity = ((maxlen / 1024) + 1) * 1024; + b->buffer = xrealloc(b->buffer, b->capacity); + } else { + b->capacity = ((maxlen / 1024) + 1) * 1024; + b->buffer = xmalloc(b->capacity); + } + + c = b->buffer; for (i = 0; i < t->entry_count; i++) { struct tree_entry *e = t->entries[i]; if (!e->versions[v].mode) @@ -925,17 +946,14 @@ static void* mktree(struct tree_content *t, int v, unsigned long *szp) hashcpy((unsigned char*)c, e->versions[v].sha1); c += 20; } - - *szp = c - buf; - return buf; + *szp = c - (char*)b->buffer; } static void store_tree(struct tree_entry *root) { struct tree_content *t = root->tree; unsigned int i, j, del; - unsigned long vers1len; - void **vers1dat; + unsigned long new_len; struct last_object lo; if (!is_null_sha1(root->versions[1].sha1)) @@ -951,16 
+969,16 @@ static void store_tree(struct tree_entry *root) lo.data = NULL; lo.depth = 0; } else { - lo.data = mktree(t, 0, &lo.len); + mktree(t, 0, &lo.len, &old_tree); + lo.data = old_tree.buffer; lo.depth = t->delta_depth; + lo.no_free = 1; hashcpy(lo.sha1, root->versions[0].sha1); } - vers1dat = mktree(t, 1, &vers1len); + mktree(t, 1, &new_len, &new_tree); - store_object(OBJ_TREE, vers1dat, vers1len, + store_object(OBJ_TREE, new_tree.buffer, new_len, &lo, root->versions[1].sha1, 0); - /* note: lo.dat (if created) was freed by store_object */ - free(vers1dat); t->delta_depth = lo.depth; hashcpy(root->versions[0].sha1, root->versions[1].sha1); From 243f801d1d08753cd4eff2a23e245f7575c37ad5 Mon Sep 17 00:00:00 2001 From: "Shawn O. Pearce" Date: Mon, 28 Aug 2006 13:15:48 -0400 Subject: [PATCH 32/81] Reuse the same buffer for all commits/tags in fast-import. Since most commits and tag objects are around the same size and we only generate one at a time we can reuse the same buffer rather than xmalloc'ing and free'ing the buffer every time we generate a commit. Signed-off-by: Shawn O. 
Pearce --- fast-import.c | 43 +++++++++++++++++++++++-------------------- 1 file changed, 23 insertions(+), 20 deletions(-) diff --git a/fast-import.c b/fast-import.c index 8d15a05739..3d99102005 100644 --- a/fast-import.c +++ b/fast-import.c @@ -267,6 +267,7 @@ static struct tag *last_tag; /* Input stream parsing */ static struct strbuf command_buf; static unsigned long next_mark; +static struct dbuf new_data; static FILE* branch_log; @@ -381,6 +382,17 @@ static char* pool_strdup(const char *s) return r; } +static void size_dbuf(struct dbuf *b, size_t maxlen) +{ + if (b->buffer) { + if (b->capacity >= maxlen) + return; + free(b->buffer); + } + b->capacity = ((maxlen / 1024) + 1) * 1024; + b->buffer = xmalloc(b->capacity); +} + static void insert_mark(unsigned long idnum, struct object_entry *oe) { struct mark_set *s = marks; @@ -925,15 +937,7 @@ static void mktree(struct tree_content *t, maxlen += t->entries[i]->name->str_len + 34; } - if (b->buffer) { - if (b->capacity < maxlen) - b->capacity = ((maxlen / 1024) + 1) * 1024; - b->buffer = xrealloc(b->buffer, b->capacity); - } else { - b->capacity = ((maxlen / 1024) + 1) * 1024; - b->buffer = xmalloc(b->capacity); - } - + size_dbuf(b, maxlen); c = b->buffer; for (i = 0; i < t->entry_count; i++) { struct tree_entry *e = t->entries[i]; @@ -1515,7 +1519,6 @@ static void cmd_new_commit() char *sp; char *author = NULL; char *committer = NULL; - char *body; /* Obtain the branch name from the rest of our command */ sp = strchr(command_buf.buf, ' ') + 1; @@ -1568,11 +1571,11 @@ static void cmd_new_commit() /* build the tree and the commit */ store_tree(&b->branch_tree); - body = xmalloc(97 + msglen + size_dbuf(&new_data, 97 + msglen + (author ? 
strlen(author) + strlen(committer) : 2 * strlen(committer))); - sp = body; + sp = new_data.buffer; sp += sprintf(sp, "tree %s\n", sha1_to_hex(b->branch_tree.versions[1].sha1)); if (!is_null_sha1(b->sha1)) @@ -1589,8 +1592,9 @@ static void cmd_new_commit() free(committer); free(msg); - store_object(OBJ_COMMIT, body, sp - body, NULL, b->sha1, next_mark); - free(body); + store_object(OBJ_COMMIT, + new_data.buffer, sp - (char*)new_data.buffer, + NULL, b->sha1, next_mark); b->last_commit = object_count_by_type[OBJ_COMMIT]; if (branch_log) { @@ -1616,7 +1620,6 @@ static void cmd_new_tag() struct branch *s; void *msg; size_t msglen; - char *body; struct tag *t; unsigned long from_mark = 0; unsigned char sha1[20]; @@ -1688,8 +1691,8 @@ static void cmd_new_tag() msg = cmd_data(&msglen); /* build the tag object */ - body = xmalloc(67 + strlen(t->name) + strlen(tagger) + msglen); - sp = body; + size_dbuf(&new_data, 67+strlen(t->name)+strlen(tagger)+msglen); + sp = new_data.buffer; sp += sprintf(sp, "object %s\n", sha1_to_hex(sha1)); sp += sprintf(sp, "type %s\n", type_names[OBJ_COMMIT]); sp += sprintf(sp, "tag %s\n", t->name); @@ -1699,8 +1702,8 @@ static void cmd_new_tag() free(tagger); free(msg); - store_object(OBJ_TAG, body, sp - body, NULL, t->sha1, 0); - free(body); + store_object(OBJ_TAG, new_data.buffer, sp - (char*)new_data.buffer, + NULL, t->sha1, 0); if (branch_log) { int need_dq = quote_c_style(t->name, NULL, NULL, 0); @@ -1749,7 +1752,7 @@ int main(int argc, const char **argv) { const char *base_name; int i; - unsigned long est_obj_cnt = 1000; + unsigned long est_obj_cnt = object_entry_alloc; char *pack_name; char *idx_name; struct stat sb; From 23bc886c966b4362555b61f33c6eef71552e4d0e Mon Sep 17 00:00:00 2001 From: "Shawn O. Pearce" Date: Mon, 28 Aug 2006 13:54:01 -0400 Subject: [PATCH 33/81] Replace ywrite in fast-import with the standard write_or_die. Signed-off-by: Shawn O. 
Pearce --- fast-import.c | 27 +++++++-------------------- 1 file changed, 7 insertions(+), 20 deletions(-) diff --git a/fast-import.c b/fast-import.c index 3d99102005..f94f307ee6 100644 --- a/fast-import.c +++ b/fast-import.c @@ -586,19 +586,6 @@ static void yread(int fd, void *buffer, size_t length) } } -static void ywrite(int fd, void *buffer, size_t length) -{ - ssize_t ret = 0; - while (ret < length) { - ssize_t size = xwrite(fd, (char *) buffer + ret, length - ret); - if (!size) - die("Write to descriptor %i: end of file", fd); - if (size < 0) - die("Write to descriptor %i: %s", fd, strerror(errno)); - ret += size; - } -} - static size_t encode_header( enum object_type type, size_t size, @@ -675,8 +662,8 @@ static int store_object( s.next_in = delta; s.avail_in = deltalen; hdrlen = encode_header(OBJ_DELTA, deltalen, hdr); - ywrite(pack_fd, hdr, hdrlen); - ywrite(pack_fd, last->sha1, sizeof(sha1)); + write_or_die(pack_fd, hdr, hdrlen); + write_or_die(pack_fd, last->sha1, sizeof(sha1)); pack_size += hdrlen + sizeof(sha1); } else { if (last) @@ -684,7 +671,7 @@ static int store_object( s.next_in = dat; s.avail_in = datlen; hdrlen = encode_header(type, datlen, hdr); - ywrite(pack_fd, hdr, hdrlen); + write_or_die(pack_fd, hdr, hdrlen); pack_size += hdrlen; } @@ -694,7 +681,7 @@ static int store_object( /* nothing */; deflateEnd(&s); - ywrite(pack_fd, out, s.total_out); + write_or_die(pack_fd, out, s.total_out); pack_size += s.total_out; free(out); @@ -1117,7 +1104,7 @@ static void init_pack_header() hdr.hdr_version = htonl(2); hdr.hdr_entries = 0; - ywrite(pack_fd, &hdr, sizeof(hdr)); + write_or_die(pack_fd, &hdr, sizeof(hdr)); pack_size = sizeof(hdr); } @@ -1138,7 +1125,7 @@ static void fixup_header_footer() cnt = htonl(object_count); SHA1_Update(&c, &cnt, 4); - ywrite(pack_fd, &cnt, 4); + write_or_die(pack_fd, &cnt, 4); buf = xmalloc(128 * 1024); for (;;) { @@ -1150,7 +1137,7 @@ static void fixup_header_footer() free(buf); SHA1_Final(pack_sha1, &c); - 
ywrite(pack_fd, pack_sha1, sizeof(pack_sha1)); + write_or_die(pack_fd, pack_sha1, sizeof(pack_sha1)); } static int oecmp (const void *_a, const void *_b) From b54d6422b1a277ee905819e01020f5690196a999 Mon Sep 17 00:00:00 2001 From: "Shawn O. Pearce" Date: Mon, 28 Aug 2006 21:43:04 -0400 Subject: [PATCH 34/81] Correct tree corruption problems in fast-import. The new tree delta implementation caused blob SHA1s to be used instead of a tree SHA1 when a tree was written out. This really only appeared to happen when converting an existing file to a tree, but may have been possible in some other situations. Signed-off-by: Shawn O. Pearce --- fast-import.c | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/fast-import.c b/fast-import.c index f94f307ee6..34ff946fa3 100644 --- a/fast-import.c +++ b/fast-import.c @@ -956,7 +956,8 @@ static void store_tree(struct tree_entry *root) } if (is_null_sha1(root->versions[0].sha1) - || !find_object(root->versions[0].sha1)) { + || !find_object(root->versions[0].sha1) + || !S_ISDIR(root->versions[0].mode)) { lo.data = NULL; lo.depth = 0; } else { @@ -1023,6 +1024,7 @@ static int tree_content_set( if (!S_ISDIR(e->versions[1].mode)) { e->tree = new_tree_content(8); e->versions[1].mode = S_IFDIR; + hashclr(e->versions[1].sha1); } if (!e->tree) load_tree(e); @@ -1044,6 +1046,7 @@ static int tree_content_set( if (slash1) { e->tree = new_tree_content(8); e->versions[1].mode = S_IFDIR; + hashclr(e->versions[1].sha1); tree_content_set(e, slash1 + 1, sha1, mode); } else { e->tree = NULL; @@ -1075,10 +1078,13 @@ static int tree_content_remove(struct tree_entry *root, const char *p) if (!e->tree) load_tree(e); if (tree_content_remove(e, slash1 + 1)) { - if (!e->tree->entry_count) - goto del_entry; - hashclr(root->versions[1].sha1); - return 1; + for (n = 0; n < e->tree->entry_count; n++) { + if (e->tree->entries[n]->versions[1].mode) { + hashclr(root->versions[1].sha1); + return 1; + } + } + goto del_entry; } return 
0; } From 8a8c55ea709d26ca397d6588e85579339885f507 Mon Sep 17 00:00:00 2001 From: "Shawn O. Pearce" Date: Mon, 28 Aug 2006 22:06:13 -0400 Subject: [PATCH 35/81] Additional fast-import tree delta corruption cleanups. Signed-off-by: Shawn O. Pearce --- fast-import.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/fast-import.c b/fast-import.c index 34ff946fa3..e35a89f6cd 100644 --- a/fast-import.c +++ b/fast-import.c @@ -478,6 +478,8 @@ static struct branch* new_branch(const char *name) b = pool_calloc(1, sizeof(struct branch)); b->name = pool_strdup(name); b->table_next_branch = branch_table[hc]; + b->branch_tree.versions[0].mode = S_IFDIR; + b->branch_tree.versions[1].mode = S_IFDIR; branch_table[hc] = b; branch_count++; return b; @@ -955,9 +957,9 @@ static void store_tree(struct tree_entry *root) store_tree(t->entries[i]); } - if (is_null_sha1(root->versions[0].sha1) - || !find_object(root->versions[0].sha1) - || !S_ISDIR(root->versions[0].mode)) { + if (!S_ISDIR(root->versions[0].mode) + || is_null_sha1(root->versions[0].sha1) + || !find_object(root->versions[0].sha1)) { lo.data = NULL; lo.depth = 0; } else { @@ -967,13 +969,12 @@ static void store_tree(struct tree_entry *root) lo.no_free = 1; hashcpy(lo.sha1, root->versions[0].sha1); } - mktree(t, 1, &new_len, &new_tree); + mktree(t, 1, &new_len, &new_tree); store_object(OBJ_TREE, new_tree.buffer, new_len, &lo, root->versions[1].sha1, 0); t->delta_depth = lo.depth; - hashcpy(root->versions[0].sha1, root->versions[1].sha1); for (i = 0, j = 0, del = 0; i < t->entry_count; i++) { struct tree_entry *e = t->entries[i]; if (e->versions[1].mode) { @@ -1024,7 +1025,6 @@ static int tree_content_set( if (!S_ISDIR(e->versions[1].mode)) { e->tree = new_tree_content(8); e->versions[1].mode = S_IFDIR; - hashclr(e->versions[1].sha1); } if (!e->tree) load_tree(e); @@ -1046,7 +1046,6 @@ static int tree_content_set( if (slash1) { e->tree = new_tree_content(8); e->versions[1].mode = 
S_IFDIR; - hashclr(e->versions[1].sha1); tree_content_set(e, slash1 + 1, sha1, mode); } else { e->tree = NULL; @@ -1564,6 +1563,8 @@ static void cmd_new_commit() /* build the tree and the commit */ store_tree(&b->branch_tree); + hashcpy(b->branch_tree.versions[0].sha1, + b->branch_tree.versions[1].sha1); size_dbuf(&new_data, 97 + msglen + (author ? strlen(author) + strlen(committer) @@ -1823,9 +1824,9 @@ int main(int argc, const char **argv) fclose(branch_log); fprintf(stderr, "%s statistics:\n", argv[0]); - fprintf(stderr, "---------------------------------------------------\n"); + fprintf(stderr, "---------------------------------------------------------------------\n"); fprintf(stderr, "Alloc'd objects: %10lu (%10lu overflow )\n", alloc_count, alloc_count - est_obj_cnt); - fprintf(stderr, "Total objects: %10lu (%10lu duplicates)\n", object_count, duplicate_count); + fprintf(stderr, "Total objects: %10lu (%10lu duplicates )\n", object_count, duplicate_count); fprintf(stderr, " blobs : %10lu (%10lu duplicates %10lu deltas)\n", object_count_by_type[OBJ_BLOB], duplicate_count_by_type[OBJ_BLOB], delta_count_by_type[OBJ_BLOB]); fprintf(stderr, " trees : %10lu (%10lu duplicates %10lu deltas)\n", object_count_by_type[OBJ_TREE], duplicate_count_by_type[OBJ_TREE], delta_count_by_type[OBJ_TREE]); fprintf(stderr, " commits: %10lu (%10lu duplicates %10lu deltas)\n", object_count_by_type[OBJ_COMMIT], duplicate_count_by_type[OBJ_COMMIT], delta_count_by_type[OBJ_COMMIT]); @@ -1837,12 +1838,11 @@ int main(int argc, const char **argv) fprintf(stderr, " pools: %10lu KiB\n", total_allocd/1024); fprintf(stderr, " objects: %10lu KiB\n", (alloc_count*sizeof(struct object_entry))/1024); fprintf(stderr, "Pack remaps: %10lu\n", remap_count); - fprintf(stderr, "---------------------------------------------------\n"); - stat(pack_name, &sb); fprintf(stderr, "Pack size: %10lu KiB\n", (unsigned long)(sb.st_size/1024)); stat(idx_name, &sb); fprintf(stderr, "Index size: %10lu KiB\n", (unsigned 
long)(sb.st_size/1024)); + fprintf(stderr, "---------------------------------------------------------------------\n"); fprintf(stderr, "\n"); From cacbdd0afb481a6f3019e5e7db98f88e40941fd5 Mon Sep 17 00:00:00 2001 From: "Shawn O. Pearce" Date: Thu, 11 Jan 2007 21:25:01 -0500 Subject: [PATCH 36/81] Fix repository corruption when using marks for modified blobs. Apparently we did not copy the blob SHA1 into the stack variable 'sha1' when a mark is used to refer to a prior blob. This code was not previously tested as the Mozilla CVS -> git-fast-import program always fed us full SHA1s for modified blobs and did not use the mark feature there. Signed-off-by: Shawn O. Pearce --- fast-import.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fast-import.c b/fast-import.c index e35a89f6cd..e9a46c6c3a 100644 --- a/fast-import.c +++ b/fast-import.c @@ -1378,6 +1378,7 @@ static void file_change_m(struct branch *b) if (*p == ':') { char *x; oe = find_mark(strtoul(p + 1, &x, 10)); + hashcpy(sha1, oe->sha1); p = x; } else { if (get_sha1_hex(p, sha1)) From 62b6f48388faf0ac2432a07cfc53aa904c591f8f Mon Sep 17 00:00:00 2001 From: "Shawn O. Pearce" Date: Thu, 11 Jan 2007 22:21:38 -0500 Subject: [PATCH 37/81] Support creation of merge commits in fast-import. Some importers are able to determine when branch merges occurred within their source data. In these cases they will want to supply the correct commits to fast-import so that a proper merge commit will exist in Git. This is now supported by supplying a 'merge ' command after the commit message and optional from command. A merge is not actually performed by fast-import; it's assumed that the frontend performed any sort of merging activity already and that fast-import should simply be storing its result. Signed-off-by: Shawn O.
Pearce --- fast-import.c | 58 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 58 insertions(+) diff --git a/fast-import.c b/fast-import.c index e9a46c6c3a..15db4b39d1 100644 --- a/fast-import.c +++ b/fast-import.c @@ -20,6 +20,7 @@ Format of STDIN stream: 'committer' sp name '<' email '>' ts tz lf commit_msg ('from' sp (ref_str | hexsha1 | sha1exp_str | idnum) lf)? + ('merge' sp (ref_str | hexsha1 | sha1exp_str | idnum) lf)* file_change* lf; commit_msg ::= data; @@ -202,6 +203,11 @@ struct dbuf size_t capacity; }; +struct hash_list +{ + struct hash_list *next; + unsigned char sha1[20]; +}; /* Stats and misc. counters */ static unsigned long max_depth = 10; @@ -1502,6 +1508,48 @@ static void cmd_from(struct branch *b) read_next_command(); } +static struct hash_list* cmd_merge(unsigned int *count) +{ + struct hash_list *list = NULL, *n, *e; + const char *from, *endp; + char *str_uq; + struct branch *s; + + *count = 0; + while (!strncmp("merge ", command_buf.buf, 6)) { + from = strchr(command_buf.buf, ' ') + 1; + str_uq = unquote_c_style(from, &endp); + if (str_uq) { + if (*endp) + die("Garbage after string in: %s", command_buf.buf); + from = str_uq; + } + + n = xmalloc(sizeof(*n)); + s = lookup_branch(from); + if (s) + hashcpy(n->sha1, s->sha1); + else if (*from == ':') { + unsigned long idnum = strtoul(from + 1, NULL, 10); + struct object_entry *oe = find_mark(idnum); + if (oe->type != OBJ_COMMIT) + die("Mark :%lu not a commit", idnum); + hashcpy(n->sha1, oe->sha1); + } else if (get_sha1(from, n->sha1)) + die("Invalid ref name or SHA1 expression: %s", from); + + n->next = NULL; + if (list) + e->next = n; + else + list = n; + e = n; + *count++; + read_next_command(); + } + return list; +} + static void cmd_new_commit() { struct branch *b; @@ -1512,6 +1560,8 @@ static void cmd_new_commit() char *sp; char *author = NULL; char *committer = NULL; + struct hash_list *merge_list = NULL; + unsigned int merge_count; /* Obtain the branch name from the rest 
of our command */ sp = strchr(command_buf.buf, ' ') + 1; @@ -1542,6 +1592,7 @@ static void cmd_new_commit() msg = cmd_data(&msglen); read_next_command(); cmd_from(b); + merge_list = cmd_merge(&merge_count); /* ensure the branch is active/loaded */ if (!b->branch_tree.tree || !max_active_branches) { @@ -1567,6 +1618,7 @@ static void cmd_new_commit() hashcpy(b->branch_tree.versions[0].sha1, b->branch_tree.versions[1].sha1); size_dbuf(&new_data, 97 + msglen + + merge_count * 49 + (author ? strlen(author) + strlen(committer) : 2 * strlen(committer))); @@ -1575,6 +1627,12 @@ static void cmd_new_commit() sha1_to_hex(b->branch_tree.versions[1].sha1)); if (!is_null_sha1(b->sha1)) sp += sprintf(sp, "parent %s\n", sha1_to_hex(b->sha1)); + while (merge_list) { + struct hash_list *next = merge_list->next; + sp += sprintf(sp, "parent %s\n", sha1_to_hex(merge_list->sha1)); + free(merge_list); + merge_list = next; + } if (author) sp += sprintf(sp, "%s\n", author); else From 9938ffc53a15c755bbd3894c02492b940ea34c4c Mon Sep 17 00:00:00 2001 From: "Shawn O. Pearce" Date: Thu, 11 Jan 2007 22:28:39 -0500 Subject: [PATCH 38/81] Allow creating branches without committing in fast-import. Some importers may want to create a branch long before they actually commit to it, or in some cases they may never commit to the branch but they still need the ref to be created in the repository after the import is complete. This extends the 'reset ' command to automatically create a new branch if the supplied reference isn't already known as a branch. While I'm at it I also modified the syntax of the reset command to terminate with an empty line, like commit and tag operate. This just makes the command set more consistent. Signed-off-by: Shawn O. 
Pearce --- fast-import.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/fast-import.c b/fast-import.c index 15db4b39d1..38e24bf6a6 100644 --- a/fast-import.c +++ b/fast-import.c @@ -36,7 +36,9 @@ Format of STDIN stream: tag_msg; tag_msg ::= data; - reset_branch ::= 'reset' sp ref_str lf; + reset_branch ::= 'reset' sp ref_str lf + ('from' sp (ref_str | hexsha1 | sha1exp_str | idnum) lf)? + lf; # note: the first idnum in a stream should be 1 and subsequent # idnums should not have gaps between values as this will cause @@ -1794,8 +1796,12 @@ static void cmd_reset_branch() b->branch_tree.tree = NULL; } } + else + b = new_branch(sp); if (str_uq) free(str_uq); + read_next_command(); + cmd_from(b); } static const char fast_import_usage[] = From d489bc14919cdd37d3978065591199d21d6719f8 Mon Sep 17 00:00:00 2001 From: "Shawn O. Pearce" Date: Sun, 14 Jan 2007 06:20:23 -0500 Subject: [PATCH 39/81] Improve reuse of sha1_file library within fast-import. Now that the sha1_file.c library routines use the sliding mmap routines to perform efficient access to portions of a packfile I can remove that code from fast-import.c and just invoke it. One benefit is we now have reloading support for any packfile which uses OBJ_OFS_DELTA. Another is we have significantly less code to maintain. This code reuse change *requires* that fast-import generate only an OBJ_OFS_DELTA format packfile, as there is absolutely no index available to perform OBJ_REF_DELTA lookup in while unpacking an object. This is probably reasonable to require as the delta offsets result in smaller packfiles and are faster to unpack, as no index searching is required. It's also only a temporary requirement as users could always repack without offsets before making the import available to older versions of Git. Signed-off-by: Shawn O.
Pearce --- fast-import.c | 175 +++++++++----------------------------------------- 1 file changed, 31 insertions(+), 144 deletions(-) diff --git a/fast-import.c b/fast-import.c index 492a8594bf..f0f51a6899 100644 --- a/fast-import.c +++ b/fast-import.c @@ -136,9 +136,9 @@ struct last_object { void *data; unsigned long len; + unsigned long offset; unsigned int depth; - int no_free; - unsigned char sha1[20]; + unsigned no_free:1; }; struct mem_pool @@ -235,13 +235,10 @@ static unsigned int atom_cnt; static struct atom_str **atom_table; /* The .pack file being generated */ +static struct packed_git *pack_data; static int pack_fd; static unsigned long pack_size; static unsigned char pack_sha1[20]; -static unsigned char* pack_base; -static unsigned long pack_moff; -static unsigned long pack_mlen = 128*1024*1024; -static unsigned long page_size; /* Table of objects we've written. */ static unsigned int object_entry_alloc = 5000; @@ -667,14 +664,23 @@ static int store_object( deflateInit(&s, zlib_compression_level); if (delta) { + unsigned long ofs = e->offset - last->offset; + unsigned pos = sizeof(hdr) - 1; + delta_count_by_type[type]++; last->depth++; s.next_in = delta; s.avail_in = deltalen; - hdrlen = encode_header(OBJ_REF_DELTA, deltalen, hdr); + + hdrlen = encode_header(OBJ_OFS_DELTA, deltalen, hdr); write_or_die(pack_fd, hdr, hdrlen); - write_or_die(pack_fd, last->sha1, sizeof(sha1)); - pack_size += hdrlen + sizeof(sha1); + pack_size += hdrlen; + + hdr[pos] = ofs & 127; + while (ofs >>= 7) + hdr[--pos] = 128 | (--ofs & 127); + write_or_die(pack_fd, hdr + pos, sizeof(hdr) - pos); + pack_size += sizeof(hdr) - pos; } else { if (last) last->depth = 0; @@ -701,139 +707,17 @@ static int store_object( if (last->data && !last->no_free) free(last->data); last->data = dat; + last->offset = e->offset; last->len = datlen; - hashcpy(last->sha1, sha1); } return 0; } -static unsigned char* map_pack(unsigned long offset, unsigned int *left) +static void *gfi_unpack_entry(unsigned 
long ofs, unsigned long *sizep) { - if (offset >= pack_size) - die("object offset outside of pack file"); - if (!pack_base - || offset < pack_moff - || (offset + 20) >= (pack_moff + pack_mlen)) { - if (pack_base) - munmap(pack_base, pack_mlen); - pack_moff = (offset / page_size) * page_size; - pack_base = mmap(NULL,pack_mlen,PROT_READ,MAP_SHARED, - pack_fd,pack_moff); - if (pack_base == MAP_FAILED) - die("Failed to map generated pack: %s", strerror(errno)); - remap_count++; - } - offset -= pack_moff; - if (left) - *left = pack_mlen - offset; - return pack_base + offset; -} - -static unsigned long unpack_object_header(unsigned long offset, - enum object_type *type, - unsigned long *sizep) -{ - unsigned shift; - unsigned char c; - unsigned long size; - - c = *map_pack(offset++, NULL); - *type = (c >> 4) & 7; - size = c & 15; - shift = 4; - while (c & 0x80) { - c = *map_pack(offset++, NULL); - size += (c & 0x7f) << shift; - shift += 7; - } - *sizep = size; - return offset; -} - -static void *unpack_non_delta_entry(unsigned long o, unsigned long sz) -{ - z_stream stream; - unsigned char *result; - - result = xmalloc(sz + 1); - result[sz] = 0; - - memset(&stream, 0, sizeof(stream)); - stream.next_in = map_pack(o, &stream.avail_in); - stream.next_out = result; - stream.avail_out = sz; - - inflateInit(&stream); - for (;;) { - int st = inflate(&stream, Z_FINISH); - if (st == Z_STREAM_END) - break; - if (st == Z_OK || st == Z_BUF_ERROR) { - o = stream.next_in - pack_base + pack_moff; - stream.next_in = map_pack(o, &stream.avail_in); - continue; - } - die("Error %i from zlib during inflate.", st); - } - inflateEnd(&stream); - if (stream.total_out != sz) - die("Error after inflate: sizes mismatch"); - return result; -} - -static void *gfi_unpack_entry(unsigned long offset, - unsigned long *sizep, - unsigned int *delta_depth); - -static void *unpack_delta_entry(unsigned long offset, - unsigned long delta_size, - unsigned long *sizep, - unsigned int *delta_depth) -{ - struct 
object_entry *base_oe; - unsigned char *base_sha1; - void *delta_data, *base, *result; - unsigned long base_size, result_size; - - base_sha1 = map_pack(offset, NULL); - base_oe = find_object(base_sha1); - if (!base_oe) - die("I'm broken; I can't find a base I know must be here."); - base = gfi_unpack_entry(base_oe->offset, &base_size, delta_depth); - delta_data = unpack_non_delta_entry(offset + 20, delta_size); - result = patch_delta(base, base_size, - delta_data, delta_size, - &result_size); - if (!result) - die("failed to apply delta"); - free(delta_data); - free(base); - *sizep = result_size; - (*delta_depth)++; - return result; -} - -static void *gfi_unpack_entry(unsigned long offset, - unsigned long *sizep, - unsigned int *delta_depth) -{ - unsigned long size; - enum object_type kind; - - offset = unpack_object_header(offset, &kind, &size); - switch (kind) { - case OBJ_REF_DELTA: - return unpack_delta_entry(offset, size, sizep, delta_depth); - case OBJ_COMMIT: - case OBJ_TREE: - case OBJ_BLOB: - case OBJ_TAG: - *sizep = size; - *delta_depth = 0; - return unpack_non_delta_entry(offset, size); - default: - die("I created an object I can't read!"); - } + char type[20]; + pack_data->pack_size = pack_size + 20; + return unpack_entry(pack_data, ofs, type, sizep); } static const char *get_mode(const char *str, unsigned int *modep) @@ -867,7 +751,8 @@ static void load_tree(struct tree_entry *root) if (myoe) { if (myoe->type != OBJ_TREE) die("Not a tree: %s", sha1_to_hex(sha1)); - buf = gfi_unpack_entry(myoe->offset, &size, &t->delta_depth); + t->delta_depth = 0; + buf = gfi_unpack_entry(myoe->offset, &size); } else { char type[20]; buf = read_sha1_file(sha1, type, &size); @@ -956,6 +841,7 @@ static void store_tree(struct tree_entry *root) unsigned int i, j, del; unsigned long new_len; struct last_object lo; + struct object_entry *le; if (!is_null_sha1(root->versions[1].sha1)) return; @@ -965,17 +851,16 @@ static void store_tree(struct tree_entry *root) 
store_tree(t->entries[i]); } - if (!S_ISDIR(root->versions[0].mode) - || is_null_sha1(root->versions[0].sha1) - || !find_object(root->versions[0].sha1)) { + le = find_object(root->versions[0].sha1); + if (!S_ISDIR(root->versions[0].mode) || !le) { lo.data = NULL; lo.depth = 0; } else { mktree(t, 0, &lo.len, &old_tree); lo.data = old_tree.buffer; + lo.offset = le->offset; lo.depth = t->delta_depth; lo.no_free = 1; - hashcpy(lo.sha1, root->versions[0].sha1); } mktree(t, 1, &new_len, &new_tree); @@ -1471,12 +1356,11 @@ static void cmd_from(struct branch *b) unsigned long idnum = strtoul(from + 1, NULL, 10); struct object_entry *oe = find_mark(idnum); unsigned long size; - unsigned int depth; char *buf; if (oe->type != OBJ_COMMIT) die("Mark :%lu not a commit", idnum); hashcpy(b->sha1, oe->sha1); - buf = gfi_unpack_entry(oe->offset, &size, &depth); + buf = gfi_unpack_entry(oe->offset, &size); if (!buf || size < 46) die("Not a valid commit: %s", from); if (memcmp("tree ", buf, 5) @@ -1818,7 +1702,6 @@ int main(int argc, const char **argv) setup_ident(); git_config(git_default_config); - page_size = getpagesize(); for (i = 1; i < argc; i++) { const char *a = argv[i]; @@ -1854,6 +1737,10 @@ int main(int argc, const char **argv) if (pack_fd < 0) die("Can't create %s: %s", pack_name, strerror(errno)); + pack_data = xcalloc(1, sizeof(*pack_data) + strlen(pack_name) + 2); + strcpy(pack_data->pack_name, pack_name); + pack_data->pack_fd = pack_fd; + init_pack_header(); alloc_objects(est_obj_cnt); strbuf_init(&command_buf); From 03842d8e24face522fa0ca846283da33e747e4f0 Mon Sep 17 00:00:00 2001 From: "Shawn O. Pearce" Date: Mon, 15 Jan 2007 00:16:23 -0500 Subject: [PATCH 40/81] Misc. type cleanups within fast-import. Signed-off-by: Shawn O. 
Pearce --- fast-import.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/fast-import.c b/fast-import.c index f0f51a6899..3a98cb848f 100644 --- a/fast-import.c +++ b/fast-import.c @@ -110,8 +110,8 @@ Format of STDIN stream: struct object_entry { struct object_entry *next; - enum object_type type; unsigned long offset; + unsigned type : TYPE_BITS; unsigned char sha1[20]; }; @@ -220,9 +220,9 @@ static unsigned long remap_count; static unsigned long object_count; static unsigned long duplicate_count; static unsigned long marks_set_count; -static unsigned long object_count_by_type[9]; -static unsigned long duplicate_count_by_type[9]; -static unsigned long delta_count_by_type[9]; +static unsigned long object_count_by_type[1 << TYPE_BITS]; +static unsigned long duplicate_count_by_type[1 << TYPE_BITS]; +static unsigned long delta_count_by_type[1 << TYPE_BITS]; /* Memory pools */ static size_t mem_pool_alloc = 2*1024*1024 - sizeof(struct mem_pool); @@ -276,7 +276,7 @@ static struct dbuf new_data; static FILE* branch_log; -static void alloc_objects(int cnt) +static void alloc_objects(unsigned int cnt) { struct object_entry_pool *b; From f70b653429ebc7fdde0b36a63e1deb4aadb450d3 Mon Sep 17 00:00:00 2001 From: "Shawn O. Pearce" Date: Mon, 15 Jan 2007 04:39:05 -0500 Subject: [PATCH 41/81] Restructure fast-import to support creating multiple packfiles. Now that we are starting to see some really large projects (such as KDE or a fork of FreeBSD) get imported into Git we're running into the upper limit on packfile object count as well as overall byte length. The KDE and FreeBSD projects are both likely to require more than 4 GiB to store their current history, which means we really need multiple packfiles to handle their content. This is a fairly simple restructuring of the internal code to help us support creating multiple packfiles from within fast-import. 
We are now adding a 5 digit incrementing suffix to the end of the basename supplied to us by the caller, permitting up to 99,999 packs to be generated in a single fast-import run. Signed-off-by: Shawn O. Pearce --- fast-import.c | 245 +++++++++++++++++++++++++------------------------- 1 file changed, 124 insertions(+), 121 deletions(-) diff --git a/fast-import.c b/fast-import.c index 3a98cb848f..fc8567e9f6 100644 --- a/fast-import.c +++ b/fast-import.c @@ -216,7 +216,6 @@ static unsigned long max_depth = 10; static unsigned long alloc_count; static unsigned long branch_count; static unsigned long branch_load_count; -static unsigned long remap_count; static unsigned long object_count; static unsigned long duplicate_count; static unsigned long marks_set_count; @@ -235,6 +234,10 @@ static unsigned int atom_cnt; static struct atom_str **atom_table; /* The .pack file being generated */ +static const char *base_name; +static unsigned int pack_count; +static char *pack_name; +static char *idx_name; static struct packed_git *pack_data; static int pack_fd; static unsigned long pack_size; @@ -593,6 +596,124 @@ static void yread(int fd, void *buffer, size_t length) } } +static void start_packfile() +{ + struct pack_header hdr; + + pack_count++; + pack_name = xmalloc(strlen(base_name) + 11); + idx_name = xmalloc(strlen(base_name) + 11); + sprintf(pack_name, "%s%5.5i.pack", base_name, pack_count); + sprintf(idx_name, "%s%5.5i.idx", base_name, pack_count); + + pack_fd = open(pack_name, O_RDWR|O_CREAT|O_EXCL, 0666); + if (pack_fd < 0) + die("Can't create %s: %s", pack_name, strerror(errno)); + + pack_data = xcalloc(1, sizeof(*pack_data) + strlen(pack_name) + 2); + strcpy(pack_data->pack_name, pack_name); + pack_data->pack_fd = pack_fd; + + hdr.hdr_signature = htonl(PACK_SIGNATURE); + hdr.hdr_version = htonl(2); + hdr.hdr_entries = 0; + + write_or_die(pack_fd, &hdr, sizeof(hdr)); + pack_size = sizeof(hdr); + object_count = 0; +} + +static void fixup_header_footer() +{ + SHA_CTX c; 
+ char hdr[8]; + unsigned long cnt; + char *buf; + + if (lseek(pack_fd, 0, SEEK_SET) != 0) + die("Failed seeking to start: %s", strerror(errno)); + + SHA1_Init(&c); + yread(pack_fd, hdr, 8); + SHA1_Update(&c, hdr, 8); + + cnt = htonl(object_count); + SHA1_Update(&c, &cnt, 4); + write_or_die(pack_fd, &cnt, 4); + + buf = xmalloc(128 * 1024); + for (;;) { + size_t n = xread(pack_fd, buf, 128 * 1024); + if (n <= 0) + break; + SHA1_Update(&c, buf, n); + } + free(buf); + + SHA1_Final(pack_sha1, &c); + write_or_die(pack_fd, pack_sha1, sizeof(pack_sha1)); +} + +static int oecmp (const void *a_, const void *b_) +{ + struct object_entry *a = *((struct object_entry**)a_); + struct object_entry *b = *((struct object_entry**)b_); + return hashcmp(a->sha1, b->sha1); +} + +static void write_index(const char *idx_name) +{ + struct sha1file *f; + struct object_entry **idx, **c, **last, *e; + struct object_entry_pool *o; + unsigned int array[256]; + int i; + + /* Build the sorted table of object IDs. */ + idx = xmalloc(object_count * sizeof(struct object_entry*)); + c = idx; + for (o = blocks; o; o = o->next_pool) + for (e = o->entries; e != o->next_free; e++) + *c++ = e; + last = idx + object_count; + qsort(idx, object_count, sizeof(struct object_entry*), oecmp); + + /* Generate the fan-out array. 
*/ + c = idx; + for (i = 0; i < 256; i++) { + struct object_entry **next = c;; + while (next < last) { + if ((*next)->sha1[0] != i) + break; + next++; + } + array[i] = htonl(next - idx); + c = next; + } + + f = sha1create("%s", idx_name); + sha1write(f, array, 256 * sizeof(int)); + for (c = idx; c != last; c++) { + unsigned int offset = htonl((*c)->offset); + sha1write(f, &offset, 4); + sha1write(f, (*c)->sha1, sizeof((*c)->sha1)); + } + sha1write(f, pack_sha1, sizeof(pack_sha1)); + sha1close(f, NULL, 1); + free(idx); +} + +static void end_packfile() +{ + fixup_header_footer(); + close(pack_fd); + write_index(idx_name); + + free(pack_name); + free(idx_name); + free(pack_data); +} + static size_t encode_header( enum object_type type, size_t size, @@ -994,100 +1115,6 @@ del_entry: return 1; } -static void init_pack_header() -{ - struct pack_header hdr; - - hdr.hdr_signature = htonl(PACK_SIGNATURE); - hdr.hdr_version = htonl(2); - hdr.hdr_entries = 0; - - write_or_die(pack_fd, &hdr, sizeof(hdr)); - pack_size = sizeof(hdr); -} - -static void fixup_header_footer() -{ - SHA_CTX c; - char hdr[8]; - unsigned long cnt; - char *buf; - size_t n; - - if (lseek(pack_fd, 0, SEEK_SET) != 0) - die("Failed seeking to start: %s", strerror(errno)); - - SHA1_Init(&c); - yread(pack_fd, hdr, 8); - SHA1_Update(&c, hdr, 8); - - cnt = htonl(object_count); - SHA1_Update(&c, &cnt, 4); - write_or_die(pack_fd, &cnt, 4); - - buf = xmalloc(128 * 1024); - for (;;) { - n = xread(pack_fd, buf, 128 * 1024); - if (n <= 0) - break; - SHA1_Update(&c, buf, n); - } - free(buf); - - SHA1_Final(pack_sha1, &c); - write_or_die(pack_fd, pack_sha1, sizeof(pack_sha1)); -} - -static int oecmp (const void *_a, const void *_b) -{ - struct object_entry *a = *((struct object_entry**)_a); - struct object_entry *b = *((struct object_entry**)_b); - return hashcmp(a->sha1, b->sha1); -} - -static void write_index(const char *idx_name) -{ - struct sha1file *f; - struct object_entry **idx, **c, **last; - struct 
object_entry *e; - struct object_entry_pool *o; - unsigned int array[256]; - int i; - - /* Build the sorted table of object IDs. */ - idx = xmalloc(object_count * sizeof(struct object_entry*)); - c = idx; - for (o = blocks; o; o = o->next_pool) - for (e = o->entries; e != o->next_free; e++) - *c++ = e; - last = idx + object_count; - qsort(idx, object_count, sizeof(struct object_entry*), oecmp); - - /* Generate the fan-out array. */ - c = idx; - for (i = 0; i < 256; i++) { - struct object_entry **next = c;; - while (next < last) { - if ((*next)->sha1[0] != i) - break; - next++; - } - array[i] = htonl(next - idx); - c = next; - } - - f = sha1create("%s", idx_name); - sha1write(f, array, 256 * sizeof(int)); - for (c = idx; c != last; c++) { - unsigned int offset = htonl((*c)->offset); - sha1write(f, &offset, 4); - sha1write(f, (*c)->sha1, sizeof((*c)->sha1)); - } - sha1write(f, pack_sha1, sizeof(pack_sha1)); - sha1close(f, NULL, 1); - free(idx); -} - static void dump_branches() { static const char *msg = "fast-import"; @@ -1693,11 +1720,8 @@ static const char fast_import_usage[] = int main(int argc, const char **argv) { - const char *base_name; int i; unsigned long est_obj_cnt = object_entry_alloc; - char *pack_name; - char *idx_name; struct stat sb; setup_ident(); @@ -1728,20 +1752,6 @@ int main(int argc, const char **argv) usage(fast_import_usage); base_name = argv[i]; - pack_name = xmalloc(strlen(base_name) + 6); - sprintf(pack_name, "%s.pack", base_name); - idx_name = xmalloc(strlen(base_name) + 5); - sprintf(idx_name, "%s.idx", base_name); - - pack_fd = open(pack_name, O_RDWR|O_CREAT|O_EXCL, 0666); - if (pack_fd < 0) - die("Can't create %s: %s", pack_name, strerror(errno)); - - pack_data = xcalloc(1, sizeof(*pack_data) + strlen(pack_name) + 2); - strcpy(pack_data->pack_name, pack_name); - pack_data->pack_fd = pack_fd; - - init_pack_header(); alloc_objects(est_obj_cnt); strbuf_init(&command_buf); @@ -1750,6 +1760,7 @@ int main(int argc, const char **argv) 
avail_tree_table = xcalloc(avail_tree_table_sz, sizeof(struct avail_tree_content*)); marks = pool_calloc(1, sizeof(struct mark_set)); + start_packfile(); for (;;) { read_next_command(); if (command_buf.eof) @@ -1765,10 +1776,8 @@ int main(int argc, const char **argv) else die("Unsupported command: %s", command_buf.buf); } + end_packfile(); - fixup_header_footer(); - close(pack_fd); - write_index(idx_name); dump_branches(); dump_tags(); dump_marks(); @@ -1789,13 +1798,7 @@ int main(int argc, const char **argv) fprintf(stderr, "Memory total: %10lu KiB\n", (total_allocd + alloc_count*sizeof(struct object_entry))/1024); fprintf(stderr, " pools: %10lu KiB\n", total_allocd/1024); fprintf(stderr, " objects: %10lu KiB\n", (alloc_count*sizeof(struct object_entry))/1024); - fprintf(stderr, "Pack remaps: %10lu\n", remap_count); - stat(pack_name, &sb); - fprintf(stderr, "Pack size: %10lu KiB\n", (unsigned long)(sb.st_size/1024)); - stat(idx_name, &sb); - fprintf(stderr, "Index size: %10lu KiB\n", (unsigned long)(sb.st_size/1024)); fprintf(stderr, "---------------------------------------------------------------------\n"); - fprintf(stderr, "\n"); return 0; From 80144727acc401070039434987692276dcb9273c Mon Sep 17 00:00:00 2001 From: "Shawn O. Pearce" Date: Mon, 15 Jan 2007 05:03:32 -0500 Subject: [PATCH 42/81] Remove unnecessary duplicate_count in fast-import. There is little reason to be keeping a global duplicate_count value when we also keep it per object type. The global counter can easily be computed at the end, once all processing has completed. This saves us a couple of machine instructions in an unimportant part of code. But it looks slightly better to me to not keep two counters around. Signed-off-by: Shawn O. 
Pearce --- fast-import.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/fast-import.c b/fast-import.c index fc8567e9f6..12127168bd 100644 --- a/fast-import.c +++ b/fast-import.c @@ -217,7 +217,6 @@ static unsigned long alloc_count; static unsigned long branch_count; static unsigned long branch_load_count; static unsigned long object_count; -static unsigned long duplicate_count; static unsigned long marks_set_count; static unsigned long object_count_by_type[1 << TYPE_BITS]; static unsigned long duplicate_count_by_type[1 << TYPE_BITS]; @@ -765,7 +764,6 @@ static int store_object( if (mark) insert_mark(mark, e); if (e->offset) { - duplicate_count++; duplicate_count_by_type[type]++; return 1; } @@ -1722,7 +1720,7 @@ int main(int argc, const char **argv) { int i; unsigned long est_obj_cnt = object_entry_alloc; - struct stat sb; + unsigned long duplicate_count; setup_ident(); git_config(git_default_config); @@ -1784,6 +1782,9 @@ int main(int argc, const char **argv) if (branch_log) fclose(branch_log); + for (i = 0; i < ARRAY_SIZE(duplicate_count_by_type); i++) + duplicate_count += duplicate_count_by_type[i]; + fprintf(stderr, "%s statistics:\n", argv[0]); fprintf(stderr, "---------------------------------------------------------------------\n"); fprintf(stderr, "Alloc'd objects: %10lu (%10lu overflow )\n", alloc_count, alloc_count - est_obj_cnt); From 7bfe6e261378a30980886994dabc0e7e4c9ce3d8 Mon Sep 17 00:00:00 2001 From: "Shawn O. Pearce" Date: Mon, 15 Jan 2007 06:35:41 -0500 Subject: [PATCH 43/81] Implemented manual packfile switching in fast-import. To help importers which are dealing with massive amounts of data fast-import needs to be able to close the packfile it is currently writing to and open a new packfile for any additional data that will be received. A new 'checkpoint' command has been introduced which can be used by the frontend import process to force this to occur at any time. 
This may be useful to ensure a very long running import doesn't lose any work due to unexpected failures. Signed-off-by: Shawn O. Pearce --- fast-import.c | 89 +++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 65 insertions(+), 24 deletions(-) diff --git a/fast-import.c b/fast-import.c index 12127168bd..3f747be287 100644 --- a/fast-import.c +++ b/fast-import.c @@ -40,6 +40,9 @@ Format of STDIN stream: ('from' sp (ref_str | hexsha1 | sha1exp_str | idnum) lf)? lf; + checkpoint ::= 'checkpoint' lf + lf; + # note: the first idnum in a stream should be 1 and subsequent # idnums should not have gaps between values as this will cause # the stream parser to reserve space for the gapped values. An @@ -112,6 +115,7 @@ struct object_entry struct object_entry *next; unsigned long offset; unsigned type : TYPE_BITS; + unsigned pack_id : 16; unsigned char sha1[20]; }; @@ -234,10 +238,10 @@ static struct atom_str **atom_table; /* The .pack file being generated */ static const char *base_name; -static unsigned int pack_count; -static char *pack_name; +static unsigned int pack_id; static char *idx_name; static struct packed_git *pack_data; +static struct packed_git **all_packs; static int pack_fd; static unsigned long pack_size; static unsigned char pack_sha1[20]; @@ -299,6 +303,7 @@ static struct object_entry* new_object(unsigned char *sha1) alloc_objects(object_entry_alloc); e = blocks->next_free++; + e->pack_id = pack_id; hashcpy(e->sha1, sha1); return e; } @@ -597,29 +602,30 @@ static void yread(int fd, void *buffer, size_t length) static void start_packfile() { + struct packed_git *p; struct pack_header hdr; - pack_count++; - pack_name = xmalloc(strlen(base_name) + 11); idx_name = xmalloc(strlen(base_name) + 11); - sprintf(pack_name, "%s%5.5i.pack", base_name, pack_count); - sprintf(idx_name, "%s%5.5i.idx", base_name, pack_count); + p = xcalloc(1, sizeof(*p) + strlen(base_name) + 13); + sprintf(p->pack_name, "%s%5.5i.pack", base_name, pack_id + 1); + 
sprintf(idx_name, "%s%5.5i.idx", base_name, pack_id + 1); - pack_fd = open(pack_name, O_RDWR|O_CREAT|O_EXCL, 0666); + pack_fd = open(p->pack_name, O_RDWR|O_CREAT|O_EXCL, 0666); if (pack_fd < 0) - die("Can't create %s: %s", pack_name, strerror(errno)); - - pack_data = xcalloc(1, sizeof(*pack_data) + strlen(pack_name) + 2); - strcpy(pack_data->pack_name, pack_name); - pack_data->pack_fd = pack_fd; + die("Can't create %s: %s", p->pack_name, strerror(errno)); + p->pack_fd = pack_fd; hdr.hdr_signature = htonl(PACK_SIGNATURE); hdr.hdr_version = htonl(2); hdr.hdr_entries = 0; - write_or_die(pack_fd, &hdr, sizeof(hdr)); + + pack_data = p; pack_size = sizeof(hdr); object_count = 0; + + all_packs = xrealloc(all_packs, sizeof(*all_packs) * (pack_id + 1)); + all_packs[pack_id] = p; } static void fixup_header_footer() @@ -673,7 +679,8 @@ static void write_index(const char *idx_name) c = idx; for (o = blocks; o; o = o->next_pool) for (e = o->entries; e != o->next_free; e++) - *c++ = e; + if (pack_id == e->pack_id) + *c++ = e; last = idx + object_count; qsort(idx, object_count, sizeof(struct object_entry*), oecmp); @@ -704,13 +711,28 @@ static void write_index(const char *idx_name) static void end_packfile() { + struct packed_git *old_p = pack_data, *new_p; + fixup_header_footer(); - close(pack_fd); write_index(idx_name); - free(pack_name); + /* Register the packfile with core git's machinary. */ + new_p = add_packed_git(idx_name, strlen(idx_name), 1); + if (!new_p) + die("core git rejected index %s", idx_name); + new_p->windows = old_p->windows; + new_p->pack_fd = old_p->pack_fd; + all_packs[pack_id++] = new_p; + install_packed_git(new_p); + free(old_p); free(idx_name); - free(pack_data); + + /* We can't carry a delta across packfiles. 
*/ + free(last_blob.data); + last_blob.data = NULL; + last_blob.len = 0; + last_blob.offset = 0; + last_blob.depth = 0; } static size_t encode_header( @@ -832,11 +854,15 @@ static int store_object( return 0; } -static void *gfi_unpack_entry(unsigned long ofs, unsigned long *sizep) +static void *gfi_unpack_entry( + struct object_entry *oe, + unsigned long *sizep) { - char type[20]; - pack_data->pack_size = pack_size + 20; - return unpack_entry(pack_data, ofs, type, sizep); + static char type[20]; + struct packed_git *p = all_packs[oe->pack_id]; + if (p == pack_data) + p->pack_size = pack_size + 20; + return unpack_entry(p, oe->offset, type, sizep); } static const char *get_mode(const char *str, unsigned int *modep) @@ -871,7 +897,7 @@ static void load_tree(struct tree_entry *root) if (myoe->type != OBJ_TREE) die("Not a tree: %s", sha1_to_hex(sha1)); t->delta_depth = 0; - buf = gfi_unpack_entry(myoe->offset, &size); + buf = gfi_unpack_entry(myoe, &size); } else { char type[20]; buf = read_sha1_file(sha1, type, &size); @@ -971,7 +997,9 @@ static void store_tree(struct tree_entry *root) } le = find_object(root->versions[0].sha1); - if (!S_ISDIR(root->versions[0].mode) || !le) { + if (!S_ISDIR(root->versions[0].mode) + || !le + || le->pack_id != pack_id) { lo.data = NULL; lo.depth = 0; } else { @@ -1385,7 +1413,7 @@ static void cmd_from(struct branch *b) if (oe->type != OBJ_COMMIT) die("Mark :%lu not a commit", idnum); hashcpy(b->sha1, oe->sha1); - buf = gfi_unpack_entry(oe->offset, &size); + buf = gfi_unpack_entry(oe, &size); if (!buf || size < 46) die("Not a valid commit: %s", from); if (memcmp("tree ", buf, 5) @@ -1713,6 +1741,15 @@ static void cmd_reset_branch() cmd_from(b); } +static void cmd_checkpoint() +{ + if (object_count) { + end_packfile(); + start_packfile(); + } + read_next_command(); +} + static const char fast_import_usage[] = "git-fast-import [--objects=n] [--depth=n] [--active-branches=n] [--export-marks=marks.file] [--branch-log=log] temp.pack"; @@ 
-1771,6 +1808,8 @@ int main(int argc, const char **argv) cmd_new_tag(); else if (!strncmp("reset ", command_buf.buf, 6)) cmd_reset_branch(); + else if (!strcmp("checkpoint", command_buf.buf)) + cmd_checkpoint(); else die("Unsupported command: %s", command_buf.buf); } @@ -1800,6 +1839,8 @@ int main(int argc, const char **argv) fprintf(stderr, " pools: %10lu KiB\n", total_allocd/1024); fprintf(stderr, " objects: %10lu KiB\n", (alloc_count*sizeof(struct object_entry))/1024); fprintf(stderr, "---------------------------------------------------------------------\n"); + pack_report(); + fprintf(stderr, "---------------------------------------------------------------------\n"); fprintf(stderr, "\n"); return 0; From 3e005baf8542a3116e51c4b0a27b72c7e14d949b Mon Sep 17 00:00:00 2001 From: "Shawn O. Pearce" Date: Mon, 15 Jan 2007 06:39:39 -0500 Subject: [PATCH 44/81] Don't create a final empty packfile in fast-import. If the last packfile is going to be empty (has 0 objects) then it shouldn't be kept after the import has terminated, as there is no point to the packfile. So rather than hashing it and making the index file, just delete the packfile. Signed-off-by: Shawn O. Pearce --- fast-import.c | 26 ++++++++++++++++---------- 1 file changed, 16 insertions(+), 10 deletions(-) diff --git a/fast-import.c b/fast-import.c index 3f747be287..207acb3230 100644 --- a/fast-import.c +++ b/fast-import.c @@ -713,17 +713,23 @@ static void end_packfile() { struct packed_git *old_p = pack_data, *new_p; - fixup_header_footer(); - write_index(idx_name); + if (object_count) { + fixup_header_footer(); + write_index(idx_name); - /* Register the packfile with core git's machinary. */ - new_p = add_packed_git(idx_name, strlen(idx_name), 1); - if (!new_p) - die("core git rejected index %s", idx_name); - new_p->windows = old_p->windows; - new_p->pack_fd = old_p->pack_fd; - all_packs[pack_id++] = new_p; - install_packed_git(new_p); + /* Register the packfile with core git's machinary. 
*/ + new_p = add_packed_git(idx_name, strlen(idx_name), 1); + if (!new_p) + die("core git rejected index %s", idx_name); + new_p->windows = old_p->windows; + new_p->pack_fd = old_p->pack_fd; + all_packs[pack_id++] = new_p; + install_packed_git(new_p); + } + else { + close(pack_fd); + unlink(old_p->pack_name); + } free(old_p); free(idx_name); From 2fce1f3c862845d23b2bd8305f97abb115623192 Mon Sep 17 00:00:00 2001 From: "Shawn O. Pearce" Date: Mon, 15 Jan 2007 06:51:58 -0500 Subject: [PATCH 45/81] Optimize index creation on large object sets in fast-import. When we are generating multiple packfiles at once we only need to scan the blocks of object_entry structs which contain objects for the current packfile. Because the most recent blocks are at the front of the linked list, and because all new objects going into the current file are allocated from the front of that list, we can stop scanning for objects as soon as we identify one which doesn't belong to the current packfile. Signed-off-by: Shawn O. Pearce --- fast-import.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/fast-import.c b/fast-import.c index 207acb3230..cfadda0432 100644 --- a/fast-import.c +++ b/fast-import.c @@ -678,10 +678,15 @@ static void write_index(const char *idx_name) idx = xmalloc(object_count * sizeof(struct object_entry*)); c = idx; for (o = blocks; o; o = o->next_pool) - for (e = o->entries; e != o->next_free; e++) - if (pack_id == e->pack_id) - *c++ = e; + for (e = o->next_free; e-- != o->entries;) { + if (pack_id != e->pack_id) + goto sort_index; + *c++ = e; + } +sort_index: last = idx + object_count; + if (c != last) + die("internal consistency error creating the index"); qsort(idx, object_count, sizeof(struct object_entry*), oecmp); /* Generate the fan-out array. */ From d9ee53ce45b0f1c26285417b900b3c5735721f7e Mon Sep 17 00:00:00 2001 From: "Shawn O. 
Pearce" Date: Mon, 15 Jan 2007 08:00:49 -0500 Subject: [PATCH 46/81] Implemented automatic checkpoints within fast-import. When the number of objects or number of bytes gets close to the limit allowed by the packfile format (or configured on the command line by our caller) we should automatically checkpoint the current packfile and start a new one before writing the object out. This does however require that we abandon the delta (if we had one) as it's not valid in a new packfile. I also added the simple rule that if we got a delta back but the delta itself is the same size as or larger than the uncompressed object to ignore the delta and just store the object data. This should avoid some really bad behavior caused by our current delta strategy. Signed-off-by: Shawn O. Pearce --- fast-import.c | 96 ++++++++++++++++++++++++++++++++++++--------------- 1 file changed, 68 insertions(+), 28 deletions(-) diff --git a/fast-import.c b/fast-import.c index cfadda0432..c19567f68c 100644 --- a/fast-import.c +++ b/fast-import.c @@ -217,6 +217,8 @@ struct hash_list /* Stats and misc. 
counters */ static unsigned long max_depth = 10; +static unsigned long max_objects = -1; +static unsigned long max_packsize = -1; static unsigned long alloc_count; static unsigned long branch_count; static unsigned long branch_load_count; @@ -303,7 +305,6 @@ static struct object_entry* new_object(unsigned char *sha1) alloc_objects(object_entry_alloc); e = blocks->next_free++; - e->pack_id = pack_id; hashcpy(e->sha1, sha1); return e; } @@ -678,12 +679,9 @@ static void write_index(const char *idx_name) idx = xmalloc(object_count * sizeof(struct object_entry*)); c = idx; for (o = blocks; o; o = o->next_pool) - for (e = o->next_free; e-- != o->entries;) { - if (pack_id != e->pack_id) - goto sort_index; - *c++ = e; - } -sort_index: + for (e = o->next_free; e-- != o->entries;) + if (pack_id == e->pack_id) + *c++ = e; last = idx + object_count; if (c != last) die("internal consistency error creating the index"); @@ -746,6 +744,12 @@ static void end_packfile() last_blob.depth = 0; } +static void checkpoint() +{ + end_packfile(); + start_packfile(); +} + static size_t encode_header( enum object_type type, size_t size, @@ -800,20 +804,64 @@ static int store_object( duplicate_count_by_type[type]++; return 1; } - e->type = type; - e->offset = pack_size; - object_count++; - object_count_by_type[type]++; - if (last && last->data && last->depth < max_depth) + if (last && last->data && last->depth < max_depth) { delta = diff_delta(last->data, last->len, dat, datlen, &deltalen, 0); - else - delta = 0; + if (delta && deltalen >= datlen) { + free(delta); + delta = NULL; + } + } else + delta = NULL; memset(&s, 0, sizeof(s)); deflateInit(&s, zlib_compression_level); + if (delta) { + s.next_in = delta; + s.avail_in = deltalen; + } else { + s.next_in = dat; + s.avail_in = datlen; + } + s.avail_out = deflateBound(&s, s.avail_in); + s.next_out = out = xmalloc(s.avail_out); + while (deflate(&s, Z_FINISH) == Z_OK) + /* nothing */; + deflateEnd(&s); + + /* Determine if we should 
auto-checkpoint. */ + if ((object_count + 1) > max_objects + || (object_count + 1) < object_count + || (pack_size + 60 + s.total_out) > max_packsize + || (pack_size + 60 + s.total_out) < pack_size) { + + /* This new object needs to *not* have the current pack_id. */ + e->pack_id = pack_id + 1; + checkpoint(); + + /* We cannot carry a delta into the new pack. */ + if (delta) { + free(delta); + delta = NULL; + } + memset(&s, 0, sizeof(s)); + deflateInit(&s, zlib_compression_level); + s.next_in = dat; + s.avail_in = datlen; + s.avail_out = deflateBound(&s, s.avail_in); + s.next_out = out; + while (deflate(&s, Z_FINISH) == Z_OK) + /* nothing */; + deflateEnd(&s); + } + + e->type = type; + e->pack_id = pack_id; + e->offset = pack_size; + object_count++; + object_count_by_type[type]++; if (delta) { unsigned long ofs = e->offset - last->offset; @@ -821,8 +869,6 @@ static int store_object( delta_count_by_type[type]++; last->depth++; - s.next_in = delta; - s.avail_in = deltalen; hdrlen = encode_header(OBJ_OFS_DELTA, deltalen, hdr); write_or_die(pack_fd, hdr, hdrlen); @@ -836,19 +882,11 @@ static int store_object( } else { if (last) last->depth = 0; - s.next_in = dat; - s.avail_in = datlen; hdrlen = encode_header(type, datlen, hdr); write_or_die(pack_fd, hdr, hdrlen); pack_size += hdrlen; } - s.avail_out = deflateBound(&s, s.avail_in); - s.next_out = out = xmalloc(s.avail_out); - while (deflate(&s, Z_FINISH) == Z_OK) - /* nothing */; - deflateEnd(&s); - write_or_die(pack_fd, out, s.total_out); pack_size += s.total_out; @@ -1754,10 +1792,8 @@ static void cmd_reset_branch() static void cmd_checkpoint() { - if (object_count) { - end_packfile(); - start_packfile(); - } + if (object_count) + checkpoint(); read_next_command(); } @@ -1780,6 +1816,10 @@ int main(int argc, const char **argv) break; else if (!strncmp(a, "--objects=", 10)) est_obj_cnt = strtoul(a + 10, NULL, 0); + else if (!strncmp(a, "--max-objects-per-pack=", 23)) + max_objects = strtoul(a + 23, NULL, 0); + else if 
(!strncmp(a, "--max-pack-size=", 16)) + max_packsize = strtoul(a + 16, NULL, 0) * 1024 * 1024; else if (!strncmp(a, "--depth=", 8)) max_depth = strtoul(a + 8, NULL, 0); else if (!strncmp(a, "--active-branches=", 18)) From 9d1b1b5ed7f4234ea4f2c1344ba67c6f89e2067c Mon Sep 17 00:00:00 2001 From: "Shawn O. Pearce" Date: Mon, 15 Jan 2007 08:03:38 -0500 Subject: [PATCH 47/81] Print the packfile names to stdout from fast-import. Caller scripts may want to know what packfiles the fast-import process just wrote out for them. This is now output to stdout, one packfile name per line, after we checkpoint each packfile. Signed-off-by: Shawn O. Pearce --- fast-import.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fast-import.c b/fast-import.c index c19567f68c..19d01e20ad 100644 --- a/fast-import.c +++ b/fast-import.c @@ -719,6 +719,8 @@ static void end_packfile() if (object_count) { fixup_header_footer(); write_index(idx_name); + fprintf(stdout, "%s\n", old_p->pack_name); + fflush(stdout); /* Register the packfile with core git's machinary. */ new_p = add_packed_git(idx_name, strlen(idx_name), 1); From 5d6f3ef6413172388ee5e6090afe9802a30a59f0 Mon Sep 17 00:00:00 2001 From: "Shawn O. Pearce" Date: Mon, 15 Jan 2007 23:40:27 -0500 Subject: [PATCH 48/81] Corrected buffer overflow during automatic checkpoint in fast-import. If we previously were using a delta but we needed to checkpoint the current packfile and switch to a new packfile we need to throw away the delta and compress the raw object by itself, as delta chains cannot span non-thin packfiles. Unfortunately the output buffer in this case needs to grow, as the size of the compressed object may be quite a bit larger than the size of the compressed delta. I've also avoided recompressing the object if we are checkpointing and we didn't use a delta. In this case the output buffer is the correct size and has already been populated with the right data, we just need to close out the current packfile and open a new one. 
Signed-off-by: Shawn O. Pearce --- fast-import.c | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/fast-import.c b/fast-import.c index 19d01e20ad..57d857c386 100644 --- a/fast-import.c +++ b/fast-import.c @@ -847,16 +847,17 @@ static int store_object( if (delta) { free(delta); delta = NULL; + + memset(&s, 0, sizeof(s)); + deflateInit(&s, zlib_compression_level); + s.next_in = dat; + s.avail_in = datlen; + s.avail_out = deflateBound(&s, s.avail_in); + s.next_out = out = xrealloc(out, s.avail_out); + while (deflate(&s, Z_FINISH) == Z_OK) + /* nothing */; + deflateEnd(&s); } - memset(&s, 0, sizeof(s)); - deflateInit(&s, zlib_compression_level); - s.next_in = dat; - s.avail_in = datlen; - s.avail_out = deflateBound(&s, s.avail_in); - s.next_out = out; - while (deflate(&s, Z_FINISH) == Z_OK) - /* nothing */; - deflateEnd(&s); } e->type = type; From 0ea9f045f4eaa1d37c6b318d9d6849a4f447b997 Mon Sep 17 00:00:00 2001 From: "Shawn O. Pearce" Date: Tue, 16 Jan 2007 00:33:19 -0500 Subject: [PATCH 49/81] Use uintmax_t for marks in fast-import. If a frontend wants to use a mark per file revision and per commit and is doing a truly huge import (such as a 32 GiB SVN repository) we may need more than 2**32 unique mark values, especially if the frontend is unable (or unwilling) to recycle mark values. For mark idnums we should use the largest unsigned integer type available, hoping that will be at least 64 bits when we are compiled as a 64 bit executable. This way we may consume huge amounts of memory storing our mark table, but we'll at least be able to process the entire import without failing. Signed-off-by: Shawn O. 
Pearce --- fast-import.c | 91 ++++++++++++++++++++++++----------------------- git-compat-util.h | 1 + 2 files changed, 48 insertions(+), 44 deletions(-) diff --git a/fast-import.c b/fast-import.c index 57d857c386..ebffa7c904 100644 --- a/fast-import.c +++ b/fast-import.c @@ -215,18 +215,20 @@ struct hash_list unsigned char sha1[20]; }; -/* Stats and misc. counters */ +/* Configured limits on output */ static unsigned long max_depth = 10; -static unsigned long max_objects = -1; static unsigned long max_packsize = -1; -static unsigned long alloc_count; +static uintmax_t max_objects = -1; + +/* Stats and misc. counters */ +static uintmax_t alloc_count; +static uintmax_t object_count; +static uintmax_t marks_set_count; +static uintmax_t object_count_by_type[1 << TYPE_BITS]; +static uintmax_t duplicate_count_by_type[1 << TYPE_BITS]; +static uintmax_t delta_count_by_type[1 << TYPE_BITS]; static unsigned long branch_count; static unsigned long branch_load_count; -static unsigned long object_count; -static unsigned long marks_set_count; -static unsigned long object_count_by_type[1 << TYPE_BITS]; -static unsigned long duplicate_count_by_type[1 << TYPE_BITS]; -static unsigned long delta_count_by_type[1 << TYPE_BITS]; /* Memory pools */ static size_t mem_pool_alloc = 2*1024*1024 - sizeof(struct mem_pool); @@ -279,7 +281,7 @@ static struct tag *last_tag; /* Input stream parsing */ static struct strbuf command_buf; -static unsigned long next_mark; +static uintmax_t next_mark; static struct dbuf new_data; static FILE* branch_log; @@ -406,7 +408,7 @@ static void size_dbuf(struct dbuf *b, size_t maxlen) b->buffer = xmalloc(b->capacity); } -static void insert_mark(unsigned long idnum, struct object_entry *oe) +static void insert_mark(uintmax_t idnum, struct object_entry *oe) { struct mark_set *s = marks; while ((idnum >> s->shift) >= 1024) { @@ -416,7 +418,7 @@ static void insert_mark(unsigned long idnum, struct object_entry *oe) marks = s; } while (s->shift) { - unsigned long i = 
idnum >> s->shift; + uintmax_t i = idnum >> s->shift; idnum -= i << s->shift; if (!s->data.sets[i]) { s->data.sets[i] = pool_calloc(1, sizeof(struct mark_set)); @@ -429,14 +431,14 @@ static void insert_mark(unsigned long idnum, struct object_entry *oe) s->data.marked[idnum] = oe; } -static struct object_entry* find_mark(unsigned long idnum) +static struct object_entry* find_mark(uintmax_t idnum) { - unsigned long orig_idnum = idnum; + uintmax_t orig_idnum = idnum; struct mark_set *s = marks; struct object_entry *oe = NULL; if ((idnum >> s->shift) < 1024) { while (s && s->shift) { - unsigned long i = idnum >> s->shift; + uintmax_t i = idnum >> s->shift; idnum -= i << s->shift; s = s->data.sets[i]; } @@ -444,7 +446,7 @@ static struct object_entry* find_mark(unsigned long idnum) oe = s->data.marked[idnum]; } if (!oe) - die("mark :%lu not declared", orig_idnum); + die("mark :%ju not declared", orig_idnum); return oe; } @@ -781,7 +783,7 @@ static int store_object( size_t datlen, struct last_object *last, unsigned char *sha1out, - unsigned long mark) + uintmax_t mark) { void *out, *delta; struct object_entry *e; @@ -1225,10 +1227,10 @@ static void dump_tags() } static void dump_marks_helper(FILE *f, - unsigned long base, + uintmax_t base, struct mark_set *m) { - int k; + uintmax_t k; if (m->shift) { for (k = 0; k < 1024; k++) { if (m->data.sets[k]) @@ -1238,7 +1240,7 @@ static void dump_marks_helper(FILE *f, } else { for (k = 0; k < 1024; k++) { if (m->data.marked[k]) - fprintf(f, ":%lu %s\n", base + k, + fprintf(f, ":%ju %s\n", base + k, sha1_to_hex(m->data.marked[k]->sha1)); } } @@ -1262,7 +1264,7 @@ static void read_next_command() static void cmd_mark() { if (!strncmp("mark :", command_buf.buf, 6)) { - next_mark = strtoul(command_buf.buf + 6, NULL, 10); + next_mark = strtoumax(command_buf.buf + 6, NULL, 10); read_next_command(); } else @@ -1375,7 +1377,7 @@ static void file_change_m(struct branch *b) if (*p == ':') { char *x; - oe = find_mark(strtoul(p + 1, &x, 10)); 
+ oe = find_mark(strtoumax(p + 1, &x, 10)); hashcpy(sha1, oe->sha1); p = x; } else { @@ -1458,12 +1460,12 @@ static void cmd_from(struct branch *b) hashcpy(b->branch_tree.versions[0].sha1, t); hashcpy(b->branch_tree.versions[1].sha1, t); } else if (*from == ':') { - unsigned long idnum = strtoul(from + 1, NULL, 10); + uintmax_t idnum = strtoumax(from + 1, NULL, 10); struct object_entry *oe = find_mark(idnum); unsigned long size; char *buf; if (oe->type != OBJ_COMMIT) - die("Mark :%lu not a commit", idnum); + die("Mark :%ju not a commit", idnum); hashcpy(b->sha1, oe->sha1); buf = gfi_unpack_entry(oe, &size); if (!buf || size < 46) @@ -1521,10 +1523,10 @@ static struct hash_list* cmd_merge(unsigned int *count) if (s) hashcpy(n->sha1, s->sha1); else if (*from == ':') { - unsigned long idnum = strtoul(from + 1, NULL, 10); + uintmax_t idnum = strtoumax(from + 1, NULL, 10); struct object_entry *oe = find_mark(idnum); if (oe->type != OBJ_COMMIT) - die("Mark :%lu not a commit", idnum); + die("Mark :%ju not a commit", idnum); hashcpy(n->sha1, oe->sha1); } else if (get_sha1(from, n->sha1)) die("Invalid ref name or SHA1 expression: %s", from); @@ -1650,7 +1652,7 @@ static void cmd_new_commit() fputc('"', branch_log); } else fprintf(branch_log, "%s", b->name); - fprintf(branch_log," :%lu %s\n",next_mark,sha1_to_hex(b->sha1)); + fprintf(branch_log," :%ju %s\n",next_mark,sha1_to_hex(b->sha1)); } } @@ -1665,7 +1667,7 @@ static void cmd_new_tag() void *msg; size_t msglen; struct tag *t; - unsigned long from_mark = 0; + uintmax_t from_mark = 0; unsigned char sha1[20]; /* Obtain the new tag name from the rest of our command */ @@ -1704,10 +1706,10 @@ static void cmd_new_tag() if (s) { hashcpy(sha1, s->sha1); } else if (*from == ':') { - from_mark = strtoul(from + 1, NULL, 10); + from_mark = strtoumax(from + 1, NULL, 10); struct object_entry *oe = find_mark(from_mark); if (oe->type != OBJ_COMMIT) - die("Mark :%lu not a commit", from_mark); + die("Mark :%ju not a commit", from_mark); 
hashcpy(sha1, oe->sha1); } else if (!get_sha1(from, sha1)) { unsigned long size; @@ -1758,7 +1760,7 @@ static void cmd_new_tag() fputc('"', branch_log); } else fprintf(branch_log, "%s", t->name); - fprintf(branch_log," :%lu %s\n",from_mark,sha1_to_hex(t->sha1)); + fprintf(branch_log," :%ju %s\n",from_mark,sha1_to_hex(t->sha1)); } } @@ -1806,8 +1808,8 @@ static const char fast_import_usage[] = int main(int argc, const char **argv) { int i; - unsigned long est_obj_cnt = object_entry_alloc; - unsigned long duplicate_count; + uintmax_t est_obj_cnt = object_entry_alloc; + uintmax_t duplicate_count; setup_ident(); git_config(git_default_config); @@ -1818,11 +1820,11 @@ int main(int argc, const char **argv) if (*a != '-' || !strcmp(a, "--")) break; else if (!strncmp(a, "--objects=", 10)) - est_obj_cnt = strtoul(a + 10, NULL, 0); + est_obj_cnt = strtoumax(a + 10, NULL, 0); else if (!strncmp(a, "--max-objects-per-pack=", 23)) - max_objects = strtoul(a + 23, NULL, 0); + max_objects = strtoumax(a + 23, NULL, 0); else if (!strncmp(a, "--max-pack-size=", 16)) - max_packsize = strtoul(a + 16, NULL, 0) * 1024 * 1024; + max_packsize = strtoumax(a + 16, NULL, 0) * 1024 * 1024; else if (!strncmp(a, "--depth=", 8)) max_depth = strtoul(a + 8, NULL, 0); else if (!strncmp(a, "--active-branches=", 18)) @@ -1875,23 +1877,24 @@ int main(int argc, const char **argv) if (branch_log) fclose(branch_log); + duplicate_count = 0; for (i = 0; i < ARRAY_SIZE(duplicate_count_by_type); i++) duplicate_count += duplicate_count_by_type[i]; fprintf(stderr, "%s statistics:\n", argv[0]); fprintf(stderr, "---------------------------------------------------------------------\n"); - fprintf(stderr, "Alloc'd objects: %10lu (%10lu overflow )\n", alloc_count, alloc_count - est_obj_cnt); - fprintf(stderr, "Total objects: %10lu (%10lu duplicates )\n", object_count, duplicate_count); - fprintf(stderr, " blobs : %10lu (%10lu duplicates %10lu deltas)\n", object_count_by_type[OBJ_BLOB], 
duplicate_count_by_type[OBJ_BLOB], delta_count_by_type[OBJ_BLOB]); - fprintf(stderr, " trees : %10lu (%10lu duplicates %10lu deltas)\n", object_count_by_type[OBJ_TREE], duplicate_count_by_type[OBJ_TREE], delta_count_by_type[OBJ_TREE]); - fprintf(stderr, " commits: %10lu (%10lu duplicates %10lu deltas)\n", object_count_by_type[OBJ_COMMIT], duplicate_count_by_type[OBJ_COMMIT], delta_count_by_type[OBJ_COMMIT]); - fprintf(stderr, " tags : %10lu (%10lu duplicates %10lu deltas)\n", object_count_by_type[OBJ_TAG], duplicate_count_by_type[OBJ_TAG], delta_count_by_type[OBJ_TAG]); + fprintf(stderr, "Alloc'd objects: %10ju (%10ju overflow )\n", alloc_count, alloc_count - est_obj_cnt); + fprintf(stderr, "Total objects: %10ju (%10ju duplicates )\n", object_count, duplicate_count); + fprintf(stderr, " blobs : %10ju (%10ju duplicates %10ju deltas)\n", object_count_by_type[OBJ_BLOB], duplicate_count_by_type[OBJ_BLOB], delta_count_by_type[OBJ_BLOB]); + fprintf(stderr, " trees : %10ju (%10ju duplicates %10ju deltas)\n", object_count_by_type[OBJ_TREE], duplicate_count_by_type[OBJ_TREE], delta_count_by_type[OBJ_TREE]); + fprintf(stderr, " commits: %10ju (%10ju duplicates %10ju deltas)\n", object_count_by_type[OBJ_COMMIT], duplicate_count_by_type[OBJ_COMMIT], delta_count_by_type[OBJ_COMMIT]); + fprintf(stderr, " tags : %10ju (%10ju duplicates %10ju deltas)\n", object_count_by_type[OBJ_TAG], duplicate_count_by_type[OBJ_TAG], delta_count_by_type[OBJ_TAG]); fprintf(stderr, "Total branches: %10lu (%10lu loads )\n", branch_count, branch_load_count); - fprintf(stderr, " marks: %10u (%10lu unique )\n", (1 << marks->shift) * 1024, marks_set_count); + fprintf(stderr, " marks: %10ju (%10ju unique )\n", (((uintmax_t)1) << marks->shift) * 1024, marks_set_count); fprintf(stderr, " atoms: %10u\n", atom_cnt); - fprintf(stderr, "Memory total: %10lu KiB\n", (total_allocd + alloc_count*sizeof(struct object_entry))/1024); + fprintf(stderr, "Memory total: %10ju KiB\n", (total_allocd + 
alloc_count*sizeof(struct object_entry))/1024); fprintf(stderr, " pools: %10lu KiB\n", total_allocd/1024); - fprintf(stderr, " objects: %10lu KiB\n", (alloc_count*sizeof(struct object_entry))/1024); + fprintf(stderr, " objects: %10ju KiB\n", (alloc_count*sizeof(struct object_entry))/1024); fprintf(stderr, "---------------------------------------------------------------------\n"); pack_report(); fprintf(stderr, "---------------------------------------------------------------------\n"); diff --git a/git-compat-util.h b/git-compat-util.h index 8781e8e22d..614583e56a 100644 --- a/git-compat-util.h +++ b/git-compat-util.h @@ -28,6 +28,7 @@ #include #include #include +#include #include #include #include From 6cf092619376f5bf60987f146d142497ded2f718 Mon Sep 17 00:00:00 2001 From: "Shawn O. Pearce" Date: Tue, 16 Jan 2007 00:35:41 -0500 Subject: [PATCH 50/81] Replace redundant yread() with read_in_full() in fast-import. Prior to git having read_in_full() fast-import used its own private function yread to perform the header reading task. No sense in keeping that around now that read_in_full is a public, stable function. Signed-off-by: Shawn O. 
Pearce --- fast-import.c | 16 ++-------------- 1 file changed, 2 insertions(+), 14 deletions(-) diff --git a/fast-import.c b/fast-import.c index ebffa7c904..938707c5bd 100644 --- a/fast-import.c +++ b/fast-import.c @@ -590,19 +590,6 @@ static void release_tree_entry(struct tree_entry *e) avail_tree_entry = e; } -static void yread(int fd, void *buffer, size_t length) -{ - ssize_t ret = 0; - while (ret < length) { - ssize_t size = xread(fd, (char *) buffer + ret, length - ret); - if (!size) - die("Read from descriptor %i: end of stream", fd); - if (size < 0) - die("Read from descriptor %i: %s", fd, strerror(errno)); - ret += size; - } -} - static void start_packfile() { struct packed_git *p; @@ -642,7 +629,8 @@ static void fixup_header_footer() die("Failed seeking to start: %s", strerror(errno)); SHA1_Init(&c); - yread(pack_fd, hdr, 8); + if (read_in_full(pack_fd, hdr, 8) != 8) + die("Unable to reread header of %s", pack_data->pack_name); SHA1_Update(&c, hdr, 8); cnt = htonl(object_count); From 09543c96bbe41d312bc002c293a193aa328c839d Mon Sep 17 00:00:00 2001 From: "Shawn O. Pearce" Date: Tue, 16 Jan 2007 00:44:48 -0500 Subject: [PATCH 51/81] Reuse sha1 in packed_git in fast-import. Rather than maintaing our own packfile level sha1 variable we can make use of the one already available in struct packed_git. Its meant for the SHA1 of the index but it can also hold the SHA1 of the packfile itself between final checksumming of the packfile and creation of the index. Signed-off-by: Shawn O. Pearce --- fast-import.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/fast-import.c b/fast-import.c index 938707c5bd..5767e808c6 100644 --- a/fast-import.c +++ b/fast-import.c @@ -248,7 +248,6 @@ static struct packed_git *pack_data; static struct packed_git **all_packs; static int pack_fd; static unsigned long pack_size; -static unsigned char pack_sha1[20]; /* Table of objects we've written. 
*/ static unsigned int object_entry_alloc = 5000; @@ -646,8 +645,8 @@ static void fixup_header_footer() } free(buf); - SHA1_Final(pack_sha1, &c); - write_or_die(pack_fd, pack_sha1, sizeof(pack_sha1)); + SHA1_Final(pack_data->sha1, &c); + write_or_die(pack_fd, pack_data->sha1, sizeof(pack_data->sha1)); } static int oecmp (const void *a_, const void *b_) @@ -697,8 +696,8 @@ static void write_index(const char *idx_name) sha1write(f, &offset, 4); sha1write(f, (*c)->sha1, sizeof((*c)->sha1)); } - sha1write(f, pack_sha1, sizeof(pack_sha1)); - sha1close(f, NULL, 1); + sha1write(f, pack_data->sha1, sizeof(pack_data->sha1)); + sha1close(f, pack_data->sha1, 1); free(idx); } From 8455e48476634eeff6fd2cd4f245cadfef14bbc8 Mon Sep 17 00:00:00 2001 From: "Shawn O. Pearce" Date: Tue, 16 Jan 2007 01:15:31 -0500 Subject: [PATCH 52/81] Use .keep files in fast-import during processing. Because fast-import automatically updates all references (heads and tags) at the end of its run the repository is corrupt unless the objects are available in the .git/objects/pack directory prior to the refs being modified. The easiest way to ensure that is true is to move the packfile and its associated index directly into the .git/objects/pack directory as soon as we have finished output to it. But the only safe way to do this is to create the a temporary .keep file for that pack, so we use the same tricks that index-pack uses when its being invoked by receive-pack. Signed-off-by: Shawn O. 
Pearce --- fast-import.c | 91 ++++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 71 insertions(+), 20 deletions(-) diff --git a/fast-import.c b/fast-import.c index 5767e808c6..393020504a 100644 --- a/fast-import.c +++ b/fast-import.c @@ -241,9 +241,7 @@ static unsigned int atom_cnt; static struct atom_str **atom_table; /* The .pack file being generated */ -static const char *base_name; static unsigned int pack_id; -static char *idx_name; static struct packed_git *pack_data; static struct packed_git **all_packs; static int pack_fd; @@ -591,17 +589,17 @@ static void release_tree_entry(struct tree_entry *e) static void start_packfile() { + static char tmpfile[PATH_MAX]; struct packed_git *p; struct pack_header hdr; - idx_name = xmalloc(strlen(base_name) + 11); - p = xcalloc(1, sizeof(*p) + strlen(base_name) + 13); - sprintf(p->pack_name, "%s%5.5i.pack", base_name, pack_id + 1); - sprintf(idx_name, "%s%5.5i.idx", base_name, pack_id + 1); - - pack_fd = open(p->pack_name, O_RDWR|O_CREAT|O_EXCL, 0666); + snprintf(tmpfile, sizeof(tmpfile), + "%s/pack_XXXXXX", get_object_directory()); + pack_fd = mkstemp(tmpfile); if (pack_fd < 0) - die("Can't create %s: %s", p->pack_name, strerror(errno)); + die("Can't create %s: %s", tmpfile, strerror(errno)); + p = xcalloc(1, sizeof(*p) + strlen(tmpfile) + 2); + strcpy(p->pack_name, tmpfile); p->pack_fd = pack_fd; hdr.hdr_signature = htonl(PACK_SIGNATURE); @@ -656,13 +654,15 @@ static int oecmp (const void *a_, const void *b_) return hashcmp(a->sha1, b->sha1); } -static void write_index(const char *idx_name) +static char* create_index() { + static char tmpfile[PATH_MAX]; + SHA_CTX ctx; struct sha1file *f; struct object_entry **idx, **c, **last, *e; struct object_entry_pool *o; unsigned int array[256]; - int i; + int i, idx_fd; /* Build the sorted table of object IDs. 
*/ idx = xmalloc(object_count * sizeof(struct object_entry*)); @@ -689,16 +689,68 @@ static void write_index(const char *idx_name) c = next; } - f = sha1create("%s", idx_name); + snprintf(tmpfile, sizeof(tmpfile), + "%s/index_XXXXXX", get_object_directory()); + idx_fd = mkstemp(tmpfile); + if (idx_fd < 0) + die("Can't create %s: %s", tmpfile, strerror(errno)); + f = sha1fd(idx_fd, tmpfile); sha1write(f, array, 256 * sizeof(int)); + SHA1_Init(&ctx); for (c = idx; c != last; c++) { unsigned int offset = htonl((*c)->offset); sha1write(f, &offset, 4); sha1write(f, (*c)->sha1, sizeof((*c)->sha1)); + SHA1_Update(&ctx, (*c)->sha1, 20); } sha1write(f, pack_data->sha1, sizeof(pack_data->sha1)); - sha1close(f, pack_data->sha1, 1); + sha1close(f, NULL, 1); free(idx); + SHA1_Final(pack_data->sha1, &ctx); + return tmpfile; +} + +static char* keep_pack(char *curr_index_name) +{ + static char name[PATH_MAX]; + static char *keep_msg = "fast-import"; + int keep_fd; + + chmod(pack_data->pack_name, 0444); + chmod(curr_index_name, 0444); + + snprintf(name, sizeof(name), "%s/pack/pack-%s.keep", + get_object_directory(), sha1_to_hex(pack_data->sha1)); + keep_fd = open(name, O_RDWR|O_CREAT|O_EXCL, 0600); + if (keep_fd < 0) + die("cannot create keep file"); + write(keep_fd, keep_msg, strlen(keep_msg)); + close(keep_fd); + + snprintf(name, sizeof(name), "%s/pack/pack-%s.pack", + get_object_directory(), sha1_to_hex(pack_data->sha1)); + if (move_temp_to_file(pack_data->pack_name, name)) + die("cannot store pack file"); + printf("%s\n", name); + + snprintf(name, sizeof(name), "%s/pack/pack-%s.idx", + get_object_directory(), sha1_to_hex(pack_data->sha1)); + if (move_temp_to_file(curr_index_name, name)) + die("cannot store index file"); + return name; +} + +static void unkeep_all_packs() +{ + static char name[PATH_MAX]; + int k; + + for (k = 0; k < pack_id; k++) { + struct packed_git *p = all_packs[k]; + snprintf(name, sizeof(name), "%s/pack/pack-%s.keep", + get_object_directory(), 
sha1_to_hex(p->sha1)); + unlink(name); + } } static void end_packfile() @@ -706,10 +758,10 @@ static void end_packfile() struct packed_git *old_p = pack_data, *new_p; if (object_count) { + char *idx_name; + fixup_header_footer(); - write_index(idx_name); - fprintf(stdout, "%s\n", old_p->pack_name); - fflush(stdout); + idx_name = keep_pack(create_index()); /* Register the packfile with core git's machinary. */ new_p = add_packed_git(idx_name, strlen(idx_name), 1); @@ -725,7 +777,6 @@ static void end_packfile() unlink(old_p->pack_name); } free(old_p); - free(idx_name); /* We can't carry a delta across packfiles. */ free(last_blob.data); @@ -1790,7 +1841,7 @@ static void cmd_checkpoint() } static const char fast_import_usage[] = -"git-fast-import [--objects=n] [--depth=n] [--active-branches=n] [--export-marks=marks.file] [--branch-log=log] temp.pack"; +"git-fast-import [--objects=n] [--depth=n] [--active-branches=n] [--export-marks=marks.file] [--branch-log=log]"; int main(int argc, const char **argv) { @@ -1826,9 +1877,8 @@ int main(int argc, const char **argv) else die("unknown option %s", a); } - if ((i+1) != argc) + if (i != argc) usage(fast_import_usage); - base_name = argv[i]; alloc_objects(est_obj_cnt); strbuf_init(&command_buf); @@ -1860,6 +1910,7 @@ int main(int argc, const char **argv) dump_branches(); dump_tags(); + unkeep_all_packs(); dump_marks(); if (branch_log) fclose(branch_log); From 1280158738333109cf0ada2fb378db2cdf7296ad Mon Sep 17 00:00:00 2001 From: "Shawn O. Pearce" Date: Tue, 16 Jan 2007 01:17:47 -0500 Subject: [PATCH 53/81] Ensure we close the packfile after creating it in fast-import. Because we are renaming the packfile into its file destination we need to be sure its not open when the rename is called, otherwise some operating systems (e.g. Windows) may prevent the rename from occurring. Signed-off-by: Shawn O. 
Pearce --- fast-import.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/fast-import.c b/fast-import.c index 393020504a..a9cf22dfe2 100644 --- a/fast-import.c +++ b/fast-import.c @@ -645,6 +645,7 @@ static void fixup_header_footer() SHA1_Final(pack_data->sha1, &c); write_or_die(pack_fd, pack_data->sha1, sizeof(pack_data->sha1)); + close(pack_fd); } static int oecmp (const void *a_, const void *b_) @@ -768,14 +769,11 @@ static void end_packfile() if (!new_p) die("core git rejected index %s", idx_name); new_p->windows = old_p->windows; - new_p->pack_fd = old_p->pack_fd; all_packs[pack_id++] = new_p; install_packed_git(new_p); } - else { - close(pack_fd); + else unlink(old_p->pack_name); - } free(old_p); /* We can't carry a delta across packfiles. */ From 0fcbcae75372f96539ba0f9598112c417d81ab0d Mon Sep 17 00:00:00 2001 From: "Shawn O. Pearce" Date: Tue, 16 Jan 2007 01:20:57 -0500 Subject: [PATCH 54/81] Remove unnecessary pack_fd global in fast-import. Much like the pack_sha1 the pack_fd is an unnecessary global variable, we already have the fd stored in our struct packed_git *pack_data so that the core library functions in sha1_file.c are able to lookup and decompress object data that we have previously written. Keeping an extra copy of this value in our own variable is just a hold-over from earlier versions of fast-import and is now completely unnecessary. Signed-off-by: Shawn O. Pearce --- fast-import.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/fast-import.c b/fast-import.c index a9cf22dfe2..281b8f6a5e 100644 --- a/fast-import.c +++ b/fast-import.c @@ -244,7 +244,6 @@ static struct atom_str **atom_table; static unsigned int pack_id; static struct packed_git *pack_data; static struct packed_git **all_packs; -static int pack_fd; static unsigned long pack_size; /* Table of objects we've written. 
*/ @@ -592,6 +591,7 @@ static void start_packfile() static char tmpfile[PATH_MAX]; struct packed_git *p; struct pack_header hdr; + int pack_fd; snprintf(tmpfile, sizeof(tmpfile), "%s/pack_XXXXXX", get_object_directory()); @@ -605,7 +605,7 @@ static void start_packfile() hdr.hdr_signature = htonl(PACK_SIGNATURE); hdr.hdr_version = htonl(2); hdr.hdr_entries = 0; - write_or_die(pack_fd, &hdr, sizeof(hdr)); + write_or_die(p->pack_fd, &hdr, sizeof(hdr)); pack_data = p; pack_size = sizeof(hdr); @@ -617,6 +617,7 @@ static void start_packfile() static void fixup_header_footer() { + int pack_fd = pack_data->pack_fd; SHA_CTX c; char hdr[8]; unsigned long cnt; @@ -912,23 +913,23 @@ static int store_object( last->depth++; hdrlen = encode_header(OBJ_OFS_DELTA, deltalen, hdr); - write_or_die(pack_fd, hdr, hdrlen); + write_or_die(pack_data->pack_fd, hdr, hdrlen); pack_size += hdrlen; hdr[pos] = ofs & 127; while (ofs >>= 7) hdr[--pos] = 128 | (--ofs & 127); - write_or_die(pack_fd, hdr + pos, sizeof(hdr) - pos); + write_or_die(pack_data->pack_fd, hdr + pos, sizeof(hdr) - pos); pack_size += sizeof(hdr) - pos; } else { if (last) last->depth = 0; hdrlen = encode_header(type, datlen, hdr); - write_or_die(pack_fd, hdr, hdrlen); + write_or_die(pack_data->pack_fd, hdr, hdrlen); pack_size += hdrlen; } - write_or_die(pack_fd, out, s.total_out); + write_or_die(pack_data->pack_fd, out, s.total_out); pack_size += s.total_out; free(out); From eec11c24840bfc5293a80fed3c3b1e5bc10ac453 Mon Sep 17 00:00:00 2001 From: "Shawn O. Pearce" Date: Tue, 16 Jan 2007 04:25:12 -0500 Subject: [PATCH 55/81] Correct max_packsize default in fast-import. Apparently amd64 has defined 'unsigned long' to be a 64 bit value, which means -1 was way over the 4 GiB packfile limit. Whoops. Signed-off-by: Shawn O. 
Pearce --- fast-import.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fast-import.c b/fast-import.c index 281b8f6a5e..8342314bb0 100644 --- a/fast-import.c +++ b/fast-import.c @@ -217,7 +217,7 @@ struct hash_list /* Configured limits on output */ static unsigned long max_depth = 10; -static unsigned long max_packsize = -1; +static unsigned long max_packsize = (1LL << 32) - 1; static uintmax_t max_objects = -1; /* Stats and misc. counters */ From a7ddc48765ff2e4f6601ea146cba4283a342e0b1 Mon Sep 17 00:00:00 2001 From: "Shawn O. Pearce" Date: Tue, 16 Jan 2007 04:55:41 -0500 Subject: [PATCH 56/81] Correct object_count type and stat output in fast-import. Since object_count is limited to 'unsigned long' (really an unsigned 32 bit integer value) by the pack file format we may as well use exactly that type here in fast-import for that counter. An earlier change by me incorrectly made it uintmax_t. But since object_count is a counter for the current packfile only, we don't want to output its value at the end. Instead we should sum up the individual type counters and report that total, as that will cover all of the packfiles. Signed-off-by: Shawn O. Pearce --- fast-import.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/fast-import.c b/fast-import.c index 8342314bb0..3992af5f25 100644 --- a/fast-import.c +++ b/fast-import.c @@ -222,11 +222,11 @@ static uintmax_t max_objects = -1; /* Stats and misc. 
counters */ static uintmax_t alloc_count; -static uintmax_t object_count; static uintmax_t marks_set_count; static uintmax_t object_count_by_type[1 << TYPE_BITS]; static uintmax_t duplicate_count_by_type[1 << TYPE_BITS]; static uintmax_t delta_count_by_type[1 << TYPE_BITS]; +static unsigned long object_count; static unsigned long branch_count; static unsigned long branch_load_count; @@ -1846,7 +1846,7 @@ int main(int argc, const char **argv) { int i; uintmax_t est_obj_cnt = object_entry_alloc; - uintmax_t duplicate_count; + uintmax_t total_count, duplicate_count; setup_ident(); git_config(git_default_config); @@ -1914,6 +1914,9 @@ int main(int argc, const char **argv) if (branch_log) fclose(branch_log); + total_count = 0; + for (i = 0; i < ARRAY_SIZE(object_count_by_type); i++) + total_count += object_count_by_type[i]; duplicate_count = 0; for (i = 0; i < ARRAY_SIZE(duplicate_count_by_type); i++) duplicate_count += duplicate_count_by_type[i]; @@ -1921,7 +1924,7 @@ int main(int argc, const char **argv) fprintf(stderr, "%s statistics:\n", argv[0]); fprintf(stderr, "---------------------------------------------------------------------\n"); fprintf(stderr, "Alloc'd objects: %10ju (%10ju overflow )\n", alloc_count, alloc_count - est_obj_cnt); - fprintf(stderr, "Total objects: %10ju (%10ju duplicates )\n", object_count, duplicate_count); + fprintf(stderr, "Total objects: %10ju (%10ju duplicates )\n", total_count, duplicate_count); fprintf(stderr, " blobs : %10ju (%10ju duplicates %10ju deltas)\n", object_count_by_type[OBJ_BLOB], duplicate_count_by_type[OBJ_BLOB], delta_count_by_type[OBJ_BLOB]); fprintf(stderr, " trees : %10ju (%10ju duplicates %10ju deltas)\n", object_count_by_type[OBJ_TREE], duplicate_count_by_type[OBJ_TREE], delta_count_by_type[OBJ_TREE]); fprintf(stderr, " commits: %10ju (%10ju duplicates %10ju deltas)\n", object_count_by_type[OBJ_COMMIT], duplicate_count_by_type[OBJ_COMMIT], delta_count_by_type[OBJ_COMMIT]); From 
2369ed79071edf0f040eb2c280e1e2cf9a883bb9 Mon Sep 17 00:00:00 2001 From: "Shawn O. Pearce" Date: Tue, 16 Jan 2007 16:18:44 -0500 Subject: [PATCH 57/81] Print out the edge commits for each packfile in fast-import. To help callers repack very large repositories into a series of packfiles fast-import now outputs the last commits/tags it wrote to a packfile when it prints out the packfile name. This information can be feed to pack-objects --revs to repack. For the first pack of an initial import this is pretty easy (just feed those SHA1s on stdin) but for subsequent packs you want to feed the subsequent pack's final SHA1s but also all prior pack's SHA1s prefixed with the negation operator. This way the prior pack's data does not get included into the subsequent pack. Signed-off-by: Shawn O. Pearce --- fast-import.c | 26 ++++++++++++++++++++++++-- 1 file changed, 24 insertions(+), 2 deletions(-) diff --git a/fast-import.c b/fast-import.c index 3992af5f25..84dfde9d2f 100644 --- a/fast-import.c +++ b/fast-import.c @@ -193,6 +193,7 @@ struct branch const char *name; unsigned long last_commit; struct tree_entry branch_tree; + unsigned int pack_id; unsigned char sha1[20]; }; @@ -200,6 +201,7 @@ struct tag { struct tag *next_tag; const char *name; + unsigned int pack_id; unsigned char sha1[20]; }; @@ -733,7 +735,6 @@ static char* keep_pack(char *curr_index_name) get_object_directory(), sha1_to_hex(pack_data->sha1)); if (move_temp_to_file(pack_data->pack_name, name)) die("cannot store pack file"); - printf("%s\n", name); snprintf(name, sizeof(name), "%s/pack/pack-%s.idx", get_object_directory(), sha1_to_hex(pack_data->sha1)); @@ -761,6 +762,9 @@ static void end_packfile() if (object_count) { char *idx_name; + int i; + struct branch *b; + struct tag *t; fixup_header_footer(); idx_name = keep_pack(create_index()); @@ -770,8 +774,24 @@ static void end_packfile() if (!new_p) die("core git rejected index %s", idx_name); new_p->windows = old_p->windows; - all_packs[pack_id++] = 
new_p; + all_packs[pack_id] = new_p; install_packed_git(new_p); + + /* Print the boundary */ + fprintf(stdout, "%s:", new_p->pack_name); + for (i = 0; i < branch_table_sz; i++) { + for (b = branch_table[i]; b; b = b->table_next_branch) { + if (b->pack_id == pack_id) + fprintf(stdout, " %s", sha1_to_hex(b->sha1)); + } + } + for (t = first_tag; t; t = t->next_tag) { + if (t->pack_id == pack_id) + fprintf(stdout, " %s", sha1_to_hex(t->sha1)); + } + fputc('\n', stdout); + + pack_id++; } else unlink(old_p->pack_name); @@ -1679,6 +1699,7 @@ static void cmd_new_commit() new_data.buffer, sp - (char*)new_data.buffer, NULL, b->sha1, next_mark); b->last_commit = object_count_by_type[OBJ_COMMIT]; + b->pack_id = pack_id; if (branch_log) { int need_dq = quote_c_style(b->name, NULL, NULL, 0); @@ -1787,6 +1808,7 @@ static void cmd_new_tag() store_object(OBJ_TAG, new_data.buffer, sp - (char*)new_data.buffer, NULL, t->sha1, 0); + t->pack_id = pack_id; if (branch_log) { int need_dq = quote_c_style(t->name, NULL, NULL, 0); From 2104838bf9b97066f21e4c32efdfa424d41e6b98 Mon Sep 17 00:00:00 2001 From: "Shawn O. Pearce" Date: Wed, 17 Jan 2007 00:33:18 -0500 Subject: [PATCH 58/81] Corrected BNF input documentation for fast-import. Now that fast-import uses uintmax_t (the largest available unsigned integer type) for marks we don't want to say its an unsigned 32 bit integer in ASCII base 10 notation. It could be much larger, especially on 64 bit systems, and especially if a frontend uses a very large number of marks (1 per file revision on a very, very large import). Signed-off-by: Shawn O. 
Pearce --- fast-import.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fast-import.c b/fast-import.c index 84dfde9d2f..f1b26d103f 100644 --- a/fast-import.c +++ b/fast-import.c @@ -72,6 +72,7 @@ Format of STDIN stream: path_str ::= path | '"' quoted(path) '"' ; declen ::= # unsigned 32 bit value, ascii base10 notation; + bigint ::= # unsigned integer value, ascii base10 notation; binary_data ::= # file content, not interpreted; sp ::= # ASCII space character; @@ -81,7 +82,7 @@ Format of STDIN stream: # an idnum. This is to distinguish it from a ref or tag name as # GIT does not permit ':' in ref or tag strings. # - idnum ::= ':' declen; + idnum ::= ':' bigint; path ::= # GIT style file path, e.g. "a/b/c"; ref ::= # GIT ref name, e.g. "refs/heads/MOZ_GECKO_EXPERIMENT"; tag ::= # GIT tag name, e.g. "FIREFOX_1_5"; From 6f64f6d9d2b12cdae1648cbf536685c888f3b981 Mon Sep 17 00:00:00 2001 From: "Shawn O. Pearce" Date: Wed, 17 Jan 2007 00:57:23 -0500 Subject: [PATCH 59/81] Correct a few types to be unsigned in fast-import. The length of an atom string cannot be negative. So make it explicit and declare it as an unsigned value. The shift width in a mark table node also cannot be negative. I'm also moving it to after the pointer arrays to prevent any possible alignment problems on a 64 bit system. Signed-off-by: Shawn O. 
Pearce --- fast-import.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fast-import.c b/fast-import.c index f1b26d103f..2c500d6be3 100644 --- a/fast-import.c +++ b/fast-import.c @@ -130,11 +130,11 @@ struct object_entry_pool struct mark_set { - int shift; union { struct object_entry *marked[1024]; struct mark_set *sets[1024]; } data; + unsigned int shift; }; struct last_object @@ -157,7 +157,7 @@ struct mem_pool struct atom_str { struct atom_str *next_atom; - int str_len; + unsigned int str_len; char str_dat[FLEX_ARRAY]; /* more */ }; @@ -192,8 +192,8 @@ struct branch struct branch *table_next_branch; struct branch *active_next_branch; const char *name; - unsigned long last_commit; struct tree_entry branch_tree; + unsigned long last_commit; unsigned int pack_id; unsigned char sha1[20]; }; From fd99224eec67d89f970b207e7db031b7c58e812e Mon Sep 17 00:00:00 2001 From: "Shawn O. Pearce" Date: Wed, 17 Jan 2007 01:47:25 -0500 Subject: [PATCH 60/81] Declare no-arg functions as (void) in fast-import. Apparently the git convention is to declare any function which takes no arguments as taking void. I did not do this during the early fast-import development, but should have. Signed-off-by: Shawn O. 
Pearce --- fast-import.c | 36 ++++++++++++++++++------------------ 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/fast-import.c b/fast-import.c index 2c500d6be3..84f855fb8e 100644 --- a/fast-import.c +++ b/fast-import.c @@ -561,7 +561,7 @@ static struct tree_content* grow_tree_content( return r; } -static struct tree_entry* new_tree_entry() +static struct tree_entry* new_tree_entry(void) { struct tree_entry *e; @@ -589,7 +589,7 @@ static void release_tree_entry(struct tree_entry *e) avail_tree_entry = e; } -static void start_packfile() +static void start_packfile(void) { static char tmpfile[PATH_MAX]; struct packed_git *p; @@ -618,7 +618,7 @@ static void start_packfile() all_packs[pack_id] = p; } -static void fixup_header_footer() +static void fixup_header_footer(void) { int pack_fd = pack_data->pack_fd; SHA_CTX c; @@ -659,7 +659,7 @@ static int oecmp (const void *a_, const void *b_) return hashcmp(a->sha1, b->sha1); } -static char* create_index() +static char* create_index(void) { static char tmpfile[PATH_MAX]; SHA_CTX ctx; @@ -744,7 +744,7 @@ static char* keep_pack(char *curr_index_name) return name; } -static void unkeep_all_packs() +static void unkeep_all_packs(void) { static char name[PATH_MAX]; int k; @@ -757,7 +757,7 @@ static void unkeep_all_packs() } } -static void end_packfile() +static void end_packfile(void) { struct packed_git *old_p = pack_data, *new_p; @@ -806,7 +806,7 @@ static void end_packfile() last_blob.depth = 0; } -static void checkpoint() +static void checkpoint(void) { end_packfile(); start_packfile(); @@ -1253,7 +1253,7 @@ del_entry: return 1; } -static void dump_branches() +static void dump_branches(void) { static const char *msg = "fast-import"; unsigned int i; @@ -1269,7 +1269,7 @@ static void dump_branches() } } -static void dump_tags() +static void dump_tags(void) { static const char *msg = "fast-import"; struct tag *t; @@ -1304,7 +1304,7 @@ static void dump_marks_helper(FILE *f, } } -static void dump_marks() +static 
void dump_marks(void) { if (mark_file) { @@ -1314,12 +1314,12 @@ static void dump_marks() } } -static void read_next_command() +static void read_next_command(void) { read_line(&command_buf, stdin, '\n'); } -static void cmd_mark() +static void cmd_mark(void) { if (!strncmp("mark :", command_buf.buf, 6)) { next_mark = strtoumax(command_buf.buf + 6, NULL, 10); @@ -1355,7 +1355,7 @@ static void* cmd_data (size_t *size) return buffer; } -static void cmd_new_blob() +static void cmd_new_blob(void) { size_t l; void *d; @@ -1368,7 +1368,7 @@ static void cmd_new_blob() free(d); } -static void unload_one_branch() +static void unload_one_branch(void) { while (cur_active_branches && cur_active_branches >= max_active_branches) { @@ -1601,7 +1601,7 @@ static struct hash_list* cmd_merge(unsigned int *count) return list; } -static void cmd_new_commit() +static void cmd_new_commit(void) { struct branch *b; void *msg; @@ -1715,7 +1715,7 @@ static void cmd_new_commit() } } -static void cmd_new_tag() +static void cmd_new_tag(void) { char *str_uq; const char *endp; @@ -1824,7 +1824,7 @@ static void cmd_new_tag() } } -static void cmd_reset_branch() +static void cmd_reset_branch(void) { struct branch *b; char *str_uq; @@ -1855,7 +1855,7 @@ static void cmd_reset_branch() cmd_from(b); } -static void cmd_checkpoint() +static void cmd_checkpoint(void) { if (object_count) checkpoint(); From 69e74e7412603dd536695c3d6a397673e8ae2bd2 Mon Sep 17 00:00:00 2001 From: "Shawn O. Pearce" Date: Wed, 17 Jan 2007 02:42:43 -0500 Subject: [PATCH 61/81] Correct packfile edge output in fast-import. Branches are only contained by a packfile if the branch actually had its most recent commit in that packfile. So new branches are set to MAX_PACK_ID to ensure they don't cause their commit to list as part of the first packfile when it closes out if the commit was actually in existance before fast-import started. 
Also corrected the type of last_commit to be umaxint_t to prevent overflow and wraparound on very large imports. Though that is highly unlikely to occur as we're talking 4 billion commits, which no real project has right now. Signed-off-by: Shawn O. Pearce --- fast-import.c | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) diff --git a/fast-import.c b/fast-import.c index 84f855fb8e..a3073c5f03 100644 --- a/fast-import.c +++ b/fast-import.c @@ -111,12 +111,15 @@ Format of STDIN stream: #include "strbuf.h" #include "quote.h" +#define PACK_ID_BITS 16 +#define MAX_PACK_ID ((1<table_next_branch = branch_table[hc]; b->branch_tree.versions[0].mode = S_IFDIR; b->branch_tree.versions[1].mode = S_IFDIR; + b->pack_id = MAX_PACK_ID; branch_table[hc] = b; branch_count++; return b; @@ -1696,11 +1700,11 @@ static void cmd_new_commit(void) free(committer); free(msg); - store_object(OBJ_COMMIT, + if (!store_object(OBJ_COMMIT, new_data.buffer, sp - (char*)new_data.buffer, - NULL, b->sha1, next_mark); + NULL, b->sha1, next_mark)) + b->pack_id = pack_id; b->last_commit = object_count_by_type[OBJ_COMMIT]; - b->pack_id = pack_id; if (branch_log) { int need_dq = quote_c_style(b->name, NULL, NULL, 0); @@ -1807,9 +1811,12 @@ static void cmd_new_tag(void) free(tagger); free(msg); - store_object(OBJ_TAG, new_data.buffer, sp - (char*)new_data.buffer, - NULL, t->sha1, 0); - t->pack_id = pack_id; + if (store_object(OBJ_TAG, new_data.buffer, + sp - (char*)new_data.buffer, + NULL, t->sha1, 0)) + t->pack_id = MAX_PACK_ID; + else + t->pack_id = pack_id; if (branch_log) { int need_dq = quote_c_style(t->name, NULL, NULL, 0); From 566f44252b00003d1f4e7baaaf709d74bf73770f Mon Sep 17 00:00:00 2001 From: "Shawn O. Pearce" Date: Thu, 18 Jan 2007 11:26:06 -0500 Subject: [PATCH 62/81] Always use struct pack_header for pack header in fast-import. 
Previously we were using 'unsigned int' to update the hdr_entries field of the pack header after the file had been completed and was being hashed. This may not be 32 bits on all platforms. Instead we want to always uint32_t. I'm actually cheating here by just using the pack_header like the rest of Git and letting the struct definition declare the correct type. Right now that field is still 'unsigned int' (wrong) but a pending change submitted by Simon 'corecode' Schubert changes it to uint32_t. After that change is merged in fast-import will do the right thing all of the time. Signed-off-by: Shawn O. Pearce --- fast-import.c | 26 ++++++++++++++------------ 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/fast-import.c b/fast-import.c index a3073c5f03..fb7d912eff 100644 --- a/fast-import.c +++ b/fast-import.c @@ -624,29 +624,31 @@ static void start_packfile(void) static void fixup_header_footer(void) { + static const int buf_sz = 128 * 1024; int pack_fd = pack_data->pack_fd; SHA_CTX c; - char hdr[8]; - unsigned long cnt; + struct pack_header hdr; char *buf; if (lseek(pack_fd, 0, SEEK_SET) != 0) die("Failed seeking to start: %s", strerror(errno)); + if (read_in_full(pack_fd, &hdr, sizeof(hdr)) != sizeof(hdr)) + die("Unable to reread header of %s", pack_data->pack_name); + if (lseek(pack_fd, 0, SEEK_SET) != 0) + die("Failed seeking to start: %s", strerror(errno)); + hdr.hdr_entries = htonl(object_count); + write_or_die(pack_fd, &hdr, sizeof(hdr)); SHA1_Init(&c); - if (read_in_full(pack_fd, hdr, 8) != 8) - die("Unable to reread header of %s", pack_data->pack_name); - SHA1_Update(&c, hdr, 8); + SHA1_Update(&c, &hdr, sizeof(hdr)); - cnt = htonl(object_count); - SHA1_Update(&c, &cnt, 4); - write_or_die(pack_fd, &cnt, 4); - - buf = xmalloc(128 * 1024); + buf = xmalloc(buf_sz); for (;;) { - size_t n = xread(pack_fd, buf, 128 * 1024); - if (n <= 0) + size_t n = xread(pack_fd, buf, buf_sz); + if (!n) break; + if (n < 0) + die("Failed to checksum %s", 
pack_data->pack_name); SHA1_Update(&c, buf, n); } free(buf); From ebea9dd4f1b62cb3c8302f10aaca3af0231e9818 Mon Sep 17 00:00:00 2001 From: "Shawn O. Pearce" Date: Thu, 18 Jan 2007 11:30:17 -0500 Subject: [PATCH 63/81] Use fixed-size integers when writing out the index in fast-import. Currently the pack .idx file format uses 32-bit unsigned integers for the fan-out table and the object offsets. We had previously defined these as 'unsigned int', but not every system will define that type to be a 32 bit value. To ensure maximum portability we should always use 'uint32_t'. Signed-off-by: Shawn O. Pearce --- fast-import.c | 4 ++-- git-compat-util.h | 1 + 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/fast-import.c b/fast-import.c index fb7d912eff..7f519b4de3 100644 --- a/fast-import.c +++ b/fast-import.c @@ -672,7 +672,7 @@ static char* create_index(void) struct sha1file *f; struct object_entry **idx, **c, **last, *e; struct object_entry_pool *o; - unsigned int array[256]; + uint32_t array[256]; int i, idx_fd; /* Build the sorted table of object IDs. */ @@ -709,7 +709,7 @@ static char* create_index(void) sha1write(f, array, 256 * sizeof(int)); SHA1_Init(&ctx); for (c = idx; c != last; c++) { - unsigned int offset = htonl((*c)->offset); + uint32_t offset = htonl((*c)->offset); sha1write(f, &offset, 4); sha1write(f, (*c)->sha1, sizeof((*c)->sha1)); SHA1_Update(&ctx, (*c)->sha1, 20); diff --git a/git-compat-util.h b/git-compat-util.h index 614583e56a..ac06963e8d 100644 --- a/git-compat-util.h +++ b/git-compat-util.h @@ -46,6 +46,7 @@ #include #include #include +#include #include #ifndef NO_ICONV From e5808826c4abe183b4db9bae8f13445624696f66 Mon Sep 17 00:00:00 2001 From: "Shawn O. Pearce" Date: Thu, 18 Jan 2007 12:00:49 -0500 Subject: [PATCH 64/81] Remove unnecessary options from fast-import. The --objects command line option is rather unnecessary. 
Internally we allocate objects in 5000 unit blocks, ensuring that any sort of malloc overhead is ammortized over the individual objects to almost nothing. Since most frontends don't know how many objects they will need for a given import run (and its hard for them to predict without just doing the run) we probably won't see anyone using --objects. Further since there's really no major benefit to using the option, most frontends won't even bother supplying it even if they could estimate the number of objects. So I'm removing it. The --max-objects-per-pack option was probably a mistake to even have added in the first place. The packfile format is limited to 4 GiB today; given that objects need at least 3 bytes of data (and probably need even more) there's no way we are going to exceed the limit of 1<<32-1 objects before we reach the file size limit. So I'm removing it (to slightly reduce the complexity of the code) before anyone gets any wise ideas and tries to use it. Signed-off-by: Shawn O. Pearce --- fast-import.c | 16 ++++------------ 1 file changed, 4 insertions(+), 12 deletions(-) diff --git a/fast-import.c b/fast-import.c index 7f519b4de3..9a642f2e02 100644 --- a/fast-import.c +++ b/fast-import.c @@ -224,7 +224,6 @@ struct hash_list /* Configured limits on output */ static unsigned long max_depth = 10; static unsigned long max_packsize = (1LL << 32) - 1; -static uintmax_t max_objects = -1; /* Stats and misc. counters */ static uintmax_t alloc_count; @@ -900,9 +899,7 @@ static int store_object( deflateEnd(&s); /* Determine if we should auto-checkpoint. */ - if ((object_count + 1) > max_objects - || (object_count + 1) < object_count - || (pack_size + 60 + s.total_out) > max_packsize + if ((pack_size + 60 + s.total_out) > max_packsize || (pack_size + 60 + s.total_out) < pack_size) { /* This new object needs to *not* have the current pack_id. 
*/ @@ -1872,12 +1869,11 @@ static void cmd_checkpoint(void) } static const char fast_import_usage[] = -"git-fast-import [--objects=n] [--depth=n] [--active-branches=n] [--export-marks=marks.file] [--branch-log=log]"; +"git-fast-import [--depth=n] [--active-branches=n] [--export-marks=marks.file] [--branch-log=log]"; int main(int argc, const char **argv) { int i; - uintmax_t est_obj_cnt = object_entry_alloc; uintmax_t total_count, duplicate_count; setup_ident(); @@ -1888,10 +1884,6 @@ int main(int argc, const char **argv) if (*a != '-' || !strcmp(a, "--")) break; - else if (!strncmp(a, "--objects=", 10)) - est_obj_cnt = strtoumax(a + 10, NULL, 0); - else if (!strncmp(a, "--max-objects-per-pack=", 23)) - max_objects = strtoumax(a + 23, NULL, 0); else if (!strncmp(a, "--max-pack-size=", 16)) max_packsize = strtoumax(a + 16, NULL, 0) * 1024 * 1024; else if (!strncmp(a, "--depth=", 8)) @@ -1911,7 +1903,7 @@ int main(int argc, const char **argv) if (i != argc) usage(fast_import_usage); - alloc_objects(est_obj_cnt); + alloc_objects(object_entry_alloc); strbuf_init(&command_buf); atom_table = xcalloc(atom_table_sz, sizeof(struct atom_str*)); @@ -1955,7 +1947,7 @@ int main(int argc, const char **argv) fprintf(stderr, "%s statistics:\n", argv[0]); fprintf(stderr, "---------------------------------------------------------------------\n"); - fprintf(stderr, "Alloc'd objects: %10ju (%10ju overflow )\n", alloc_count, alloc_count - est_obj_cnt); + fprintf(stderr, "Alloc'd objects: %10ju\n", alloc_count); fprintf(stderr, "Total objects: %10ju (%10ju duplicates )\n", total_count, duplicate_count); fprintf(stderr, " blobs : %10ju (%10ju duplicates %10ju deltas)\n", object_count_by_type[OBJ_BLOB], duplicate_count_by_type[OBJ_BLOB], delta_count_by_type[OBJ_BLOB]); fprintf(stderr, " trees : %10ju (%10ju duplicates %10ju deltas)\n", object_count_by_type[OBJ_TREE], duplicate_count_by_type[OBJ_TREE], delta_count_by_type[OBJ_TREE]); From 3b4dce02752d37c3cef9308eefb01ed758efe323 Mon Sep 17 
00:00:00 2001 From: "Shawn O. Pearce" Date: Thu, 18 Jan 2007 13:14:27 -0500 Subject: [PATCH 65/81] Support delimited data regions in fast-import. During testing its nice to not have to feed the length of a data chunk to the 'data' command of fast-import. Instead we would prefer to be able to establish a data chunk much like shell's << operator and use a line delimiter to denote the end of the input. So now if a data command is started as 'data < --- fast-import.c | 64 +++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 49 insertions(+), 15 deletions(-) diff --git a/fast-import.c b/fast-import.c index 9a642f2e02..90adc68042 100644 --- a/fast-import.c +++ b/fast-import.c @@ -50,14 +50,21 @@ Format of STDIN stream: # a new mark directive with the old idnum. # mark ::= 'mark' sp idnum lf; + data ::= (delimited_data | exact_data) + lf; + + # note: delim may be any string but must not contain lf. + # data_line may contain any data but must not be exactly + # delim. + delimited_data ::= 'data' sp '<<' delim lf + (data_line lf)* + delim lf; # note: declen indicates the length of binary_data in bytes. - # declen does not include the lf preceeding or trailing the - # binary data. + # declen does not include the lf preceeding the binary data. 
# - data ::= 'data' sp declen lf - binary_data - lf; + exact_data ::= 'data' sp declen lf + binary_data; # note: quoted strings are C-style quoting supporting \c for # common escapes of 'c' (e..g \n, \t, \\, \") or \nnn where nnn @@ -1334,21 +1341,48 @@ static void cmd_mark(void) static void* cmd_data (size_t *size) { - size_t n = 0; - void *buffer; size_t length; + char *buffer; if (strncmp("data ", command_buf.buf, 5)) die("Expected 'data n' command, found: %s", command_buf.buf); - length = strtoul(command_buf.buf + 5, NULL, 10); - buffer = xmalloc(length); - - while (n < length) { - size_t s = fread((char*)buffer + n, 1, length - n, stdin); - if (!s && feof(stdin)) - die("EOF in data (%lu bytes remaining)", length - n); - n += s; + if (!strncmp("<<", command_buf.buf + 5, 2)) { + char *term = xstrdup(command_buf.buf + 5 + 2); + size_t sz = 8192, term_len = command_buf.len - 5 - 2; + length = 0; + buffer = xmalloc(sz); + for (;;) { + read_next_command(); + if (command_buf.eof) + die("EOF in data (terminator '%s' not found)", term); + if (term_len == command_buf.len + && !strcmp(term, command_buf.buf)) + break; + if (sz < (length + command_buf.len)) { + sz = sz * 3 / 2 + 16; + if (sz < (length + command_buf.len)) + sz = length + command_buf.len; + buffer = xrealloc(buffer, sz); + } + memcpy(buffer + length, + command_buf.buf, + command_buf.len - 1); + length += command_buf.len - 1; + buffer[length++] = '\n'; + } + free(term); + } + else { + size_t n = 0; + length = strtoul(command_buf.buf + 5, NULL, 10); + buffer = xmalloc(length); + while (n < length) { + size_t s = fread(buffer + n, 1, length - n, stdin); + if (!s && feof(stdin)) + die("EOF in data (%lu bytes remaining)", length - n); + n += s; + } } if (fgetc(stdin) != '\n') From 50aee995121a103fe2698574e7f1d56660a5b89b Mon Sep 17 00:00:00 2001 From: "Shawn O. Pearce" Date: Thu, 18 Jan 2007 13:26:24 -0500 Subject: [PATCH 66/81] Create test case for fast-import. 
Now that its easier to craft test cases (thanks to 'data <<') we should start to verify fast-import works as expected. Signed-off-by: Shawn O. Pearce --- t/t9300-fast-import.sh | 184 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 184 insertions(+) create mode 100755 t/t9300-fast-import.sh diff --git a/t/t9300-fast-import.sh b/t/t9300-fast-import.sh new file mode 100755 index 0000000000..1463476b21 --- /dev/null +++ b/t/t9300-fast-import.sh @@ -0,0 +1,184 @@ +#!/bin/sh +# +# Copyright (c) 2007 Shawn Pearce +# + +test_description='test git-fast-import utility' +. ./test-lib.sh +. ../diff-lib.sh ;# test-lib chdir's into trash + +### +### series A +### + +test_tick +cat >input < $GIT_COMMITTER_DATE +data <expect < $GIT_COMMITTER_DATE +committer $GIT_COMMITTER_NAME <$GIT_COMMITTER_EMAIL> $GIT_COMMITTER_DATE + +initial +EOF +test_expect_success \ + 'A: verify commit' \ + 'git-cat-file commit master | sed 1d >actual && + diff -u expect actual' + +cat >expect <actual && + diff -u expect actual' + +cat >expect <actual && diff -u expect actual' + +cat >expect <actual && diff -u expect actual' + +printf abcd >expect +test_expect_success \ + 'A: verify file4' \ + 'git-cat-file blob master:file4 >actual && diff -u expect actual' + +cat >expect <input < $GIT_COMMITTER_DATE +data <input < $GIT_COMMITTER_DATE +data <expect < $GIT_COMMITTER_DATE +committer $GIT_COMMITTER_NAME <$GIT_COMMITTER_EMAIL> $GIT_COMMITTER_DATE + +second +EOF +test_expect_success \ + 'C: verify commit' \ + 'git-cat-file commit branch | sed 1d >actual && + diff -u expect actual' + +cat >expect <actual +test_expect_success \ + 'C: validate rename result' \ + 'compare_diff_raw expect actual' + +test_done From 8232dc427fb4b92b38e74e9e93b52231a67e354f Mon Sep 17 00:00:00 2001 From: "Shawn O. Pearce" Date: Thu, 18 Jan 2007 14:49:05 -0500 Subject: [PATCH 67/81] Reduce value duplication in t9300-fast-import. 
It is error prone to list the value of each file twice, instead we should list the value only once early in the script and reuse the shell variable when we need to access it. Signed-off-by: Shawn O. Pearce --- t/t9300-fast-import.sh | 34 +++++++++++++++++----------------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/t/t9300-fast-import.sh b/t/t9300-fast-import.sh index 1463476b21..40b8c073bd 100755 --- a/t/t9300-fast-import.sh +++ b/t/t9300-fast-import.sh @@ -7,6 +7,16 @@ test_description='test git-fast-import utility' . ./test-lib.sh . ../diff-lib.sh ;# test-lib chdir's into trash +file2_data='file2 +second line of EOF' + +file3_data='EOF +in 3rd file + END' + +file4_data=abcd +file4_len=4 + ### ### series A ### @@ -16,22 +26,19 @@ cat >input < $GIT_COMMITTER_DATE @@ -73,24 +80,17 @@ test_expect_success \ 'git-cat-file -p master^{tree} | sed "s/ [0-9a-f]* / /" >actual && diff -u expect actual' -cat >expect <expect test_expect_success \ 'A: verify file2' \ 'git-cat-file blob master:file2 >actual && diff -u expect actual' -cat >expect <expect test_expect_success \ 'A: verify file3' \ 'git-cat-file blob master:file3 >actual && diff -u expect actual' -printf abcd >expect +printf "$file4_data" >expect test_expect_success \ 'A: verify file4' \ 'git-cat-file blob master:file4 >actual && diff -u expect actual' From b715cfbba4083d25ec0d0f94e440ad734607ddb0 Mon Sep 17 00:00:00 2001 From: "Shawn O. Pearce" Date: Thu, 18 Jan 2007 15:17:58 -0500 Subject: [PATCH 68/81] Accept 'inline' file data in fast-import commit structure. Its very annoying to need to specify the file content ahead of a commit and use marks to connect the individual blobs to the commit's file modification entry, especially if the frontend can't/won't generate the blob SHA1s itself. Instead it would much easier to use if we can accept the blob data at the same time as we receive each file_change line. 
Now fast-import accepts 'inline' instead of a mark idnum or blob SHA1 within the 'M' type file_change command. If an inline is detected the very next line must be a 'data n' command, supplying the file data. Signed-off-by: Shawn O. Pearce --- fast-import.c | 29 ++++++++++++++++----- t/t9300-fast-import.sh | 59 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 82 insertions(+), 6 deletions(-) diff --git a/fast-import.c b/fast-import.c index 90adc68042..487a91a4ee 100644 --- a/fast-import.c +++ b/fast-import.c @@ -25,10 +25,11 @@ Format of STDIN stream: lf; commit_msg ::= data; - file_change ::= 'M' sp mode sp (hexsha1 | idnum) sp path_str lf - | 'D' sp path_str lf - ; - mode ::= '644' | '755'; + file_change ::= file_del | file_obm | file_inm; + file_del ::= 'D' sp path_str lf; + file_obm ::= 'M' sp mode sp (hexsha1 | idnum) sp path_str lf; + file_inm ::= 'M' sp mode sp 'inline' sp path_str lf + data; new_tag ::= 'tag' sp tag_str lf 'from' sp (ref_str | hexsha1 | sha1exp_str | idnum) lf @@ -77,6 +78,10 @@ Format of STDIN stream: sha1exp_str ::= sha1exp | '"' quoted(sha1exp) '"' ; tag_str ::= tag | '"' quoted(tag) '"' ; path_str ::= path | '"' quoted(path) '"' ; + mode ::= '100644' | '644' + | '100755' | '755' + | '140000' + ; declen ::= # unsigned 32 bit value, ascii base10 notation; bigint ::= # unsigned integer value, ascii base10 notation; @@ -1452,7 +1457,7 @@ static void file_change_m(struct branch *b) const char *endp; struct object_entry *oe; unsigned char sha1[20]; - unsigned int mode; + unsigned int mode, inline_data = 0; char type[20]; p = get_mode(p, &mode); @@ -1475,6 +1480,9 @@ static void file_change_m(struct branch *b) oe = find_mark(strtoumax(p + 1, &x, 10)); hashcpy(sha1, oe->sha1); p = x; + } else if (!strncmp("inline", p, 6)) { + inline_data = 1; + p += 6; } else { if (get_sha1_hex(p, sha1)) die("Invalid SHA1: %s", command_buf.buf); @@ -1491,7 +1499,16 @@ static void file_change_m(struct branch *b) p = p_uq; } - if (oe) { + if 
(inline_data) { + size_t l; + void *d; + if (!p_uq) + p = p_uq = xstrdup(p); + read_next_command(); + d = cmd_data(&l); + if (store_object(OBJ_BLOB, d, l, &last_blob, sha1, 0)) + free(d); + } else if (oe) { if (oe->type != OBJ_BLOB) die("Not a blob (actually a %s): %s", command_buf.buf, type_names[oe->type]); diff --git a/t/t9300-fast-import.sh b/t/t9300-fast-import.sh index 40b8c073bd..a5cc846b34 100755 --- a/t/t9300-fast-import.sh +++ b/t/t9300-fast-import.sh @@ -17,6 +17,12 @@ in 3rd file file4_data=abcd file4_len=4 +file5_data='an inline file. + we should see it later.' + +file6_data='#!/bin/sh +echo "$@"' + ### ### series A ### @@ -181,4 +187,57 @@ test_expect_success \ 'C: validate rename result' \ 'compare_diff_raw expect actual' +### +### series D +### + +test_tick +cat >input < $GIT_COMMITTER_DATE +data <expect <actual +test_expect_success \ + 'D: validate new files added' \ + 'compare_diff_raw expect actual' + +echo "$file5_data" >expect +test_expect_success \ + 'D: verify file5' \ + 'git-cat-file blob branch:newdir/interesting >actual && + diff -u expect actual' + +echo "$file6_data" >expect +test_expect_success \ + 'D: verify file6' \ + 'git-cat-file blob branch:newdir/exec.sh >actual && + diff -u expect actual' + test_done From 8c1f22da9f8124dfabb5da8476845250b5c35ae8 Mon Sep 17 00:00:00 2001 From: "Shawn O. Pearce" Date: Mon, 5 Feb 2007 16:05:11 -0500 Subject: [PATCH 69/81] Include checkpoint command in the BNF. This command isn't encouraged (as its slow) but it does exist and is accepted, so it still should be covered in the BNF. Signed-off-by: Shawn O. Pearce --- fast-import.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fast-import.c b/fast-import.c index f62a5a9f6c..1559f9c0ff 100644 --- a/fast-import.c +++ b/fast-import.c @@ -7,6 +7,7 @@ Format of STDIN stream: | new_commit | new_tag | reset_branch + | checkpoint ; new_blob ::= 'blob' lf From 10831c551323121bdab06c3eaf2f52c6658fd6b8 Mon Sep 17 00:00:00 2001 From: "Shawn O. 
Pearce" Date: Mon, 5 Feb 2007 16:34:56 -0500 Subject: [PATCH 70/81] Reduce memory usage of fast-import. Some structs are allocated rather frequently, but were using integer types which were far larger than required to actually store their full value range. As packfiles are limited to 4 GiB we don't need more than 32 bits to store the offset of an object within that packfile, an `unsigned long` on a 64 bit system is likely a 64 bit unsigned value. Saving 4 bytes per object on a 64 bit system can add up fast on any sizable import. As atom strings are strictly single components in a path name these are probably limited to just 255 bytes by the underlying OS. Going to that short of a string is probably too restrictive, but certainly `unsigned int` is far too large for their lengths. `unsigned short` is a reasonable limit. Modes within a tree really only need two bytes to store their whole value; using `unsigned int` here is vast overkill. Saving 4 bytes per file entry in an active branch can add up quickly on a project with a large number of files. Signed-off-by: Shawn O. 
Pearce --- fast-import.c | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/fast-import.c b/fast-import.c index 1559f9c0ff..9658c28413 100644 --- a/fast-import.c +++ b/fast-import.c @@ -130,7 +130,7 @@ Format of STDIN stream: struct object_entry { struct object_entry *next; - unsigned long offset; + uint32_t offset; unsigned type : TYPE_BITS; unsigned pack_id : PACK_ID_BITS; unsigned char sha1[20]; @@ -157,7 +157,7 @@ struct last_object { void *data; unsigned long len; - unsigned long offset; + uint32_t offset; unsigned int depth; unsigned no_free:1; }; @@ -173,7 +173,7 @@ struct mem_pool struct atom_str { struct atom_str *next_atom; - unsigned int str_len; + unsigned short str_len; char str_dat[FLEX_ARRAY]; /* more */ }; @@ -184,7 +184,7 @@ struct tree_entry struct atom_str* name; struct tree_entry_ms { - unsigned int mode; + uint16_t mode; unsigned char sha1[20]; } versions[2]; }; @@ -464,7 +464,7 @@ static struct object_entry* find_mark(uintmax_t idnum) return oe; } -static struct atom_str* to_atom(const char *s, size_t len) +static struct atom_str* to_atom(const char *s, unsigned short len) { unsigned int hc = hc_str(s, len) % atom_table_sz; struct atom_str *c; @@ -993,10 +993,10 @@ static void *gfi_unpack_entry( return unpack_entry(p, oe->offset, type, sizep); } -static const char *get_mode(const char *str, unsigned int *modep) +static const char *get_mode(const char *str, uint16_t *modep) { unsigned char c; - unsigned int mode = 0; + uint16_t mode = 0; while ((c = *str++) != ' ') { if (c < '0' || c > '7') @@ -1046,7 +1046,7 @@ static void load_tree(struct tree_entry *root) if (!c) die("Corrupt mode in %s", sha1_to_hex(sha1)); e->versions[0].mode = e->versions[1].mode; - e->name = to_atom(c, strlen(c)); + e->name = to_atom(c, (unsigned short)strlen(c)); c += e->name->str_len + 1; hashcpy(e->versions[0].sha1, (unsigned char*)c); hashcpy(e->versions[1].sha1, (unsigned char*)c); @@ -1098,7 +1098,7 @@ static void 
mktree(struct tree_content *t, struct tree_entry *e = t->entries[i]; if (!e->versions[v].mode) continue; - c += sprintf(c, "%o", e->versions[v].mode); + c += sprintf(c, "%o", (unsigned int)e->versions[v].mode); *c++ = ' '; strcpy(c, e->name->str_dat); c += e->name->str_len + 1; @@ -1161,7 +1161,7 @@ static int tree_content_set( struct tree_entry *root, const char *p, const unsigned char *sha1, - const unsigned int mode) + const uint16_t mode) { struct tree_content *t = root->tree; const char *slash1; @@ -1207,7 +1207,7 @@ static int tree_content_set( if (t->entry_count == t->entry_capacity) root->tree = t = grow_tree_content(t, 8); e = new_tree_entry(); - e->name = to_atom(p, n); + e->name = to_atom(p, (unsigned short)n); e->versions[0].mode = 0; hashclr(e->versions[0].sha1); t->entries[t->entry_count++] = e; @@ -1458,7 +1458,7 @@ static void file_change_m(struct branch *b) const char *endp; struct object_entry *oe; unsigned char sha1[20]; - unsigned int mode, inline_data = 0; + uint16_t mode, inline_data = 0; char type[20]; p = get_mode(p, &mode); From 6c3aac1c69ea0bcb2896bec96a01fdf8aa6176fa Mon Sep 17 00:00:00 2001 From: "Shawn O. Pearce" Date: Mon, 5 Feb 2007 20:30:37 -0500 Subject: [PATCH 71/81] Don't support shell-quoted refnames in fast-import. The current implementation of shell-style quoted refnames and SHA-1 expressions within fast-import contains a bad memory leak. We leak the unquoted strings used by the `from` and `merge` commands, maybe others. Its also just muddling up the docs. Since Git refnames cannot contain LF, and that is our delimiter for the end of the refname, and we accept any other character as-is, there is no reason for these strings to support quoting, except to be nice to frontends. But frontends shouldn't be expecting to use funny refs in Git, and its just as simple to never quote them as it is to always pass them through the same quoting filter as pathnames. So frontends should never quote refs, or ref expressions. 
Signed-off-by: Shawn O. Pearce --- fast-import.c | 67 ++++----------------------------------------------- 1 file changed, 5 insertions(+), 62 deletions(-) diff --git a/fast-import.c b/fast-import.c index 9658c28413..e6342386fc 100644 --- a/fast-import.c +++ b/fast-import.c @@ -75,9 +75,9 @@ Format of STDIN stream: # stream formatting is: \, " and LF. Otherwise these values # are UTF8. # - ref_str ::= ref | '"' quoted(ref) '"' ; - sha1exp_str ::= sha1exp | '"' quoted(sha1exp) '"' ; - tag_str ::= tag | '"' quoted(tag) '"' ; + ref_str ::= ref; + sha1exp_str ::= sha1exp; + tag_str ::= tag; path_str ::= path | '"' quoted(path) '"' ; mode ::= '100644' | '644' | '100755' | '755' @@ -1546,8 +1546,7 @@ static void file_change_d(struct branch *b) static void cmd_from(struct branch *b) { - const char *from, *endp; - char *str_uq; + const char *from; struct branch *s; if (strncmp("from ", command_buf.buf, 5)) @@ -1557,13 +1556,6 @@ static void cmd_from(struct branch *b) die("Can't reinitailize branch %s", b->name); from = strchr(command_buf.buf, ' ') + 1; - str_uq = unquote_c_style(from, &endp); - if (str_uq) { - if (*endp) - die("Garbage after string in: %s", command_buf.buf); - from = str_uq; - } - s = lookup_branch(from); if (b == s) die("Can't create a branch from itself: %s", b->name); @@ -1617,20 +1609,12 @@ static void cmd_from(struct branch *b) static struct hash_list* cmd_merge(unsigned int *count) { struct hash_list *list = NULL, *n, *e; - const char *from, *endp; - char *str_uq; + const char *from; struct branch *s; *count = 0; while (!strncmp("merge ", command_buf.buf, 6)) { from = strchr(command_buf.buf, ' ') + 1; - str_uq = unquote_c_style(from, &endp); - if (str_uq) { - if (*endp) - die("Garbage after string in: %s", command_buf.buf); - from = str_uq; - } - n = xmalloc(sizeof(*n)); s = lookup_branch(from); if (s) @@ -1661,8 +1645,6 @@ static void cmd_new_commit(void) struct branch *b; void *msg; size_t msglen; - char *str_uq; - const char *endp; char *sp; char 
*author = NULL; char *committer = NULL; @@ -1671,17 +1653,9 @@ static void cmd_new_commit(void) /* Obtain the branch name from the rest of our command */ sp = strchr(command_buf.buf, ' ') + 1; - str_uq = unquote_c_style(sp, &endp); - if (str_uq) { - if (*endp) - die("Garbage after ref in: %s", command_buf.buf); - sp = str_uq; - } b = lookup_branch(sp); if (!b) b = new_branch(sp); - if (str_uq) - free(str_uq); read_next_command(); cmd_mark(); @@ -1772,8 +1746,6 @@ static void cmd_new_commit(void) static void cmd_new_tag(void) { - char *str_uq; - const char *endp; char *sp; const char *from; char *tagger; @@ -1786,12 +1758,6 @@ static void cmd_new_tag(void) /* Obtain the new tag name from the rest of our command */ sp = strchr(command_buf.buf, ' ') + 1; - str_uq = unquote_c_style(sp, &endp); - if (str_uq) { - if (*endp) - die("Garbage after tag name in: %s", command_buf.buf); - sp = str_uq; - } t = pool_alloc(sizeof(struct tag)); t->next_tag = NULL; t->name = pool_strdup(sp); @@ -1800,22 +1766,12 @@ static void cmd_new_tag(void) else first_tag = t; last_tag = t; - if (str_uq) - free(str_uq); read_next_command(); /* from ... */ if (strncmp("from ", command_buf.buf, 5)) die("Expected from command, got %s", command_buf.buf); - from = strchr(command_buf.buf, ' ') + 1; - str_uq = unquote_c_style(from, &endp); - if (str_uq) { - if (*endp) - die("Garbage after string in: %s", command_buf.buf); - from = str_uq; - } - s = lookup_branch(from); if (s) { hashcpy(sha1, s->sha1); @@ -1836,9 +1792,6 @@ static void cmd_new_tag(void) free(buf); } else die("Invalid ref name or SHA1 expression: %s", from); - - if (str_uq) - free(str_uq); read_next_command(); /* tagger ... 
*/ @@ -1885,18 +1838,10 @@ static void cmd_new_tag(void) static void cmd_reset_branch(void) { struct branch *b; - char *str_uq; - const char *endp; char *sp; /* Obtain the branch name from the rest of our command */ sp = strchr(command_buf.buf, ' ') + 1; - str_uq = unquote_c_style(sp, &endp); - if (str_uq) { - if (*endp) - die("Garbage after ref in: %s", command_buf.buf); - sp = str_uq; - } b = lookup_branch(sp); if (b) { b->last_commit = 0; @@ -1907,8 +1852,6 @@ static void cmd_reset_branch(void) } else b = new_branch(sp); - if (str_uq) - free(str_uq); read_next_command(); cmd_from(b); } From 6e411d2044072072692f2d9cf9d633421ef6017a Mon Sep 17 00:00:00 2001 From: "Shawn O. Pearce" Date: Mon, 5 Feb 2007 21:09:25 -0500 Subject: [PATCH 72/81] Initial draft of fast-import documentation. This is a first pass at the manpage for git-fast-import. I have tried to cover the input format in extreme detail, creating a reference which is more detailed than the BNF grammar appearing in the header of fast-import.c. I have also covered some details about gfi's performance and memory utilization, as well as the average learning curve required to create a gfi frontend application (as it is far lower than it might appear on first glance). The documentation still lacks real example input streams, which may turn out to be difficult to format in asciidoc due to the blank lines which carry meaning within the format. Signed-off-by: Shawn O. Pearce --- Documentation/git-fast-import.txt | 655 ++++++++++++++++++++++++++++++ 1 file changed, 655 insertions(+) create mode 100644 Documentation/git-fast-import.txt diff --git a/Documentation/git-fast-import.txt b/Documentation/git-fast-import.txt new file mode 100644 index 0000000000..16308731fb --- /dev/null +++ b/Documentation/git-fast-import.txt @@ -0,0 +1,655 @@ +git-fast-import(1) +================== + +NAME +---- +git-fast-import - Backend for fast Git data importers. 
+ + +SYNOPSIS +-------- +frontend | 'git-fast-import' [options] + +DESCRIPTION +----------- +This program is usually not what the end user wants to run directly. +Most end users want to use one of the existing frontend programs, +which parses a specific type of foreign source and feeds the contents +stored there to git-fast-import (gfi). + +gfi reads a mixed command/data stream from standard input and +writes one or more packfiles directly into the current repository. +When EOF is received on standard input, fast import writes out +updated branch and tag refs, fully updating the current repository +with the newly imported data. + +The gfi backend itself can import into an empty repository (one that +has already been initialized by gitlink:git-init[1]) or incrementally +update an existing populated repository. Whether or not incremental +imports are supported from a particular foreign source depends on +the frontend program in use. + + +OPTIONS +------- +--max-pack-size=<n>:: + Maximum size of each output packfile, expressed in MiB. + The default is 4096 (4 GiB) as that is the maximum allowed + packfile size (due to file format limitations). Some + importers may wish to lower this, such as to ensure the + resulting packfiles fit on CDs. + +--depth=<n>:: + Maximum delta depth, for blob and tree deltification. + Default is 10. + +--active-branches=<n>:: + Maximum number of branches to maintain active at once. + See ``Memory Utilization'' below for details. Default is 5. + +--export-marks=<file>:: + Dumps the internal marks table to <file> when complete. + Marks are written one per line as `:markid SHA-1`. + Frontends can use this file to validate imports after they + have been completed. + +--branch-log=<file>:: + Records every tag and commit made to a log file. (This file + can be quite verbose on large imports.) This particular + option has been primarily intended to facilitate debugging + gfi and has limited usefulness in other contexts. It may + be removed in future versions.
+ + +Performance +----------- +The design of gfi allows it to import large projects in a minimum +amount of memory usage and processing time. Assuming the frontend +is able to keep up with gfi and feed it a constant stream of data, +import times for projects holding 10+ years of history and containing +100,000+ individual commits are generally completed in just 1-2 +hours on quite modest (~$2,000 USD) hardware. + +Most bottlenecks appear to be in foreign source data access (the +source just cannot extract revisions fast enough) or disk IO (gfi +writes as fast as the disk will take the data). Imports will run +faster if the source data is stored on a different drive than the +destination Git repository (due to less IO contention). + + +Development Cost +---------------- +A typical frontend for gfi tends to weigh in at approximately 200 +lines of Perl/Python/Ruby code. Most developers have been able to +create working importers in just a couple of hours, even though it +is their first exposure to gfi, and sometimes even to Git. This is +an ideal situation, given that most conversion tools are throw-away +(use once, and never look back). + + +Parallel Operation +------------------ +Like `git-push` or `git-fetch`, imports handled by gfi are safe to +run alongside parallel `git repack -a -d` or `git gc` invocations, +or any other Git operation (including `git prune`, as loose objects +are never used by gfi). + +However, gfi does not lock the branch or tag refs it is actively +importing. After EOF, during its ref update phase, gfi blindly +overwrites each imported branch or tag ref. Consequently it is not +safe to modify refs that are currently being used by a running gfi +instance, as work could be lost when gfi overwrites the refs. + + +Technical Discussion +-------------------- +gfi tracks a set of branches in memory. Any branch can be created +or modified at any point during the import process by sending a +`commit` command on the input stream. 
This design allows a frontend +program to process an unlimited number of branches simultaneously, +generating commits in the order they are available from the source +data. It also simplifies the frontend programs considerably. + +gfi does not use or alter the current working directory, or any +file within it. (It does however update the current Git repository, +as referenced by `GIT_DIR`.) Therefore an import frontend may use +the working directory for its own purposes, such as extracting file +revisions from the foreign source. This ignorance of the working +directory also allows gfi to run very quickly, as it does not +need to perform any costly file update operations when switching +between branches. + +Input Format +------------ +With the exception of raw file data (which Git does not interpret) +the gfi input format is text (ASCII) based. This text based +format simplifies development and debugging of frontend programs, +especially when a higher level language such as Perl, Python or +Ruby is being used. + +gfi is very strict about its input. Where we say SP below we mean +*exactly* one space. Likewise LF means one (and only one) linefeed. +Supplying additional whitespace characters will cause unexpected +results, such as branch names or file names with leading or trailing +spaces in their name, or early termination of gfi when it encounters +unexpected input. + +Commands +~~~~~~~~ +gfi accepts several commands to update the current repository +and control the current import process. More detailed discussion +(with examples) of each command follows later. + +`commit`:: + Creates a new branch or updates an existing branch by + creating a new commit and updating the branch to point at + the newly created commit. + +`tag`:: + Creates an annotated tag object from an existing commit or + branch. Lightweight tags are not supported by this command, + as they are not recommended for recording meaningful points + in time. 
+ +`reset`:: + Reset an existing branch (or a new branch) to a specific + revision. This command must be used to change a branch to + a specific revision without making a commit on it. + +`blob`:: + Convert raw file data into a blob, for future use in a + `commit` command. This command is optional and is not + needed to perform an import. + +`checkpoint`:: + Forces gfi to close the current packfile, generate its + unique SHA-1 checksum and index, and start a new packfile. + This command is optional and is not needed to perform + an import. + +`commit` +~~~~~~~~ +Create or update a branch with a new commit, recording one logical +change to the project. + +.... + 'commit' SP <ref> LF + mark? + ('author' SP <name> SP LT <email> GT SP