From db5e523fddd2a1a47d9ea63498734d0141925513 Mon Sep 17 00:00:00 2001 From: "Shawn O. Pearce" Date: Sat, 5 Aug 2006 02:04:21 -0400 Subject: [PATCH 01/81] Created fast-import, a tool to quickly generating a pack from blobs. Signed-off-by: Shawn O. Pearce --- .gitignore | 1 + Makefile | 1 + fast-import.c | 214 ++++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 216 insertions(+) create mode 100644 fast-import.c diff --git a/.gitignore b/.gitignore index 55cd9844d6..8ddccd7dac 100644 --- a/.gitignore +++ b/.gitignore @@ -32,6 +32,7 @@ git-diff-index git-diff-stages git-diff-tree git-describe +git-fast-import git-fetch git-fetch-pack git-findtags diff --git a/Makefile b/Makefile index b15b420ea2..a37f74a1ff 100644 --- a/Makefile +++ b/Makefile @@ -186,6 +186,7 @@ SIMPLE_PROGRAMS = \ PROGRAMS = \ git-convert-objects$X git-fetch-pack$X git-fsck-objects$X \ git-hash-object$X git-index-pack$X git-local-fetch$X \ + git-fast-import$X \ git-merge-base$X \ git-merge-index$X git-mktag$X git-mktree$X git-patch-id$X \ git-peek-remote$X git-receive-pack$X \ diff --git a/fast-import.c b/fast-import.c new file mode 100644 index 0000000000..416ba5c7c6 --- /dev/null +++ b/fast-import.c @@ -0,0 +1,214 @@ +#include "builtin.h" +#include "cache.h" +#include "object.h" +#include "blob.h" +#include "delta.h" +#include "pack.h" +#include "csum-file.h" + +static int max_depth = 10; +static unsigned long object_count; +static int packfd; +static int current_depth; +static void *lastdat; +static unsigned long lastdatlen; +static unsigned char lastsha1[20]; + +static ssize_t yread(int fd, void *buffer, size_t length) +{ + ssize_t ret = 0; + while (ret < length) { + ssize_t size = xread(fd, (char *) buffer + ret, length - ret); + if (size < 0) { + return size; + } + if (size == 0) { + return ret; + } + ret += size; + } + return ret; +} + +static ssize_t ywrite(int fd, void *buffer, size_t length) +{ + ssize_t ret = 0; + while (ret < length) { + ssize_t size = xwrite(fd, (char *) 
buffer + ret, length - ret); + if (size < 0) { + return size; + } + if (size == 0) { + return ret; + } + ret += size; + } + return ret; +} + +static unsigned long encode_header(enum object_type type, unsigned long size, unsigned char *hdr) +{ + int n = 1; + unsigned char c; + + if (type < OBJ_COMMIT || type > OBJ_DELTA) + die("bad type %d", type); + + c = (type << 4) | (size & 15); + size >>= 4; + while (size) { + *hdr++ = c | 0x80; + c = size & 0x7f; + size >>= 7; + n++; + } + *hdr = c; + return n; +} + +static void write_blob (void *dat, unsigned long datlen) +{ + z_stream s; + void *out, *delta; + unsigned char hdr[64]; + unsigned long hdrlen, deltalen; + + if (lastdat && current_depth < max_depth) { + delta = diff_delta(lastdat, lastdatlen, + dat, datlen, + &deltalen, 0); + } else + delta = 0; + + memset(&s, 0, sizeof(s)); + deflateInit(&s, zlib_compression_level); + + if (delta) { + current_depth++; + s.next_in = delta; + s.avail_in = deltalen; + hdrlen = encode_header(OBJ_DELTA, deltalen, hdr); + if (ywrite(packfd, hdr, hdrlen) != hdrlen) + die("Can't write object header: %s", strerror(errno)); + if (ywrite(packfd, lastsha1, sizeof(lastsha1)) != sizeof(lastsha1)) + die("Can't write object base: %s", strerror(errno)); + } else { + current_depth = 0; + s.next_in = dat; + s.avail_in = datlen; + hdrlen = encode_header(OBJ_BLOB, datlen, hdr); + if (ywrite(packfd, hdr, hdrlen) != hdrlen) + die("Can't write object header: %s", strerror(errno)); + } + + s.avail_out = deflateBound(&s, s.avail_in); + s.next_out = out = xmalloc(s.avail_out); + while (deflate(&s, Z_FINISH) == Z_OK) + /* nothing */; + deflateEnd(&s); + + if (ywrite(packfd, out, s.total_out) != s.total_out) + die("Failed writing compressed data %s", strerror(errno)); + + free(out); + if (delta) + free(delta); +} + +static void init_pack_header () +{ + const char* magic = "PACK"; + unsigned long version = 2; + unsigned long zero = 0; + + version = htonl(version); + + if (ywrite(packfd, (char*)magic, 4) != 
4) + die("Can't write pack magic: %s", strerror(errno)); + if (ywrite(packfd, &version, 4) != 4) + die("Can't write pack version: %s", strerror(errno)); + if (ywrite(packfd, &zero, 4) != 4) + die("Can't write 0 object count: %s", strerror(errno)); +} + +static void fixup_header_footer () +{ + SHA_CTX c; + char hdr[8]; + unsigned char sha1[20]; + unsigned long cnt; + char *buf; + size_t n; + + if (lseek(packfd, 0, SEEK_SET) != 0) + die("Failed seeking to start: %s", strerror(errno)); + + SHA1_Init(&c); + if (yread(packfd, hdr, 8) != 8) + die("Failed reading header: %s", strerror(errno)); + SHA1_Update(&c, hdr, 8); + +fprintf(stderr, "%lu objects\n", object_count); + cnt = htonl(object_count); + SHA1_Update(&c, &cnt, 4); + if (ywrite(packfd, &cnt, 4) != 4) + die("Failed writing object count: %s", strerror(errno)); + + buf = xmalloc(128 * 1024); + for (;;) { + n = xread(packfd, buf, 128 * 1024); + if (n <= 0) + break; + SHA1_Update(&c, buf, n); + } + free(buf); + + SHA1_Final(sha1, &c); + if (ywrite(packfd, sha1, sizeof(sha1)) != sizeof(sha1)) + die("Failed writing pack checksum: %s", strerror(errno)); +} + +int main (int argc, const char **argv) +{ + packfd = open(argv[1], O_RDWR|O_CREAT|O_TRUNC, 0666); + if (packfd < 0) + die("Can't create pack file %s: %s", argv[1], strerror(errno)); + + init_pack_header(); + for (;;) { + unsigned long datlen; + int hdrlen; + void *dat; + char hdr[128]; + unsigned char sha1[20]; + SHA_CTX c; + + if (yread(0, &datlen, 4) != 4) + break; + + dat = xmalloc(datlen); + if (yread(0, dat, datlen) != datlen) + break; + + hdrlen = sprintf(hdr, "blob %lu", datlen) + 1; + SHA1_Init(&c); + SHA1_Update(&c, hdr, hdrlen); + SHA1_Update(&c, dat, datlen); + SHA1_Final(sha1, &c); + + write_blob(dat, datlen); + object_count++; + printf("%s\n", sha1_to_hex(sha1)); + fflush(stdout); + + if (lastdat) + free(lastdat); + lastdat = dat; + lastdatlen = datlen; + memcpy(lastsha1, sha1, sizeof(sha1)); + } + fixup_header_footer(); + close(packfd); + + return 0; 
+} From 8bcce30126b90af83c1291e072f74950e73a2584 Mon Sep 17 00:00:00 2001 From: "Shawn O. Pearce" Date: Sun, 6 Aug 2006 13:51:39 -0400 Subject: [PATCH 02/81] Added automatic index generation to fast-import. Signed-off-by: Shawn O. Pearce --- fast-import.c | 182 ++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 163 insertions(+), 19 deletions(-) diff --git a/fast-import.c b/fast-import.c index 416ba5c7c6..0d95118499 100644 --- a/fast-import.c +++ b/fast-import.c @@ -8,11 +8,75 @@ static int max_depth = 10; static unsigned long object_count; +static unsigned long duplicate_count; +static unsigned long packoff; +static unsigned long overflow_count; static int packfd; static int current_depth; static void *lastdat; static unsigned long lastdatlen; static unsigned char lastsha1[20]; +static unsigned char packsha1[20]; + +struct object_entry +{ + struct object_entry *next; + unsigned long offset; + unsigned char sha1[20]; +}; + +struct overflow_object_entry +{ + struct overflow_object_entry *next; + struct object_entry oe; +}; + +struct object_entry *pool_start; +struct object_entry *pool_next; +struct object_entry *pool_end; +struct overflow_object_entry *overflow; +struct object_entry *table[1 << 16]; + +static struct object_entry* new_object(unsigned char *sha1) +{ + if (pool_next != pool_end) { + struct object_entry *e = pool_next++; + memcpy(e->sha1, sha1, sizeof(e->sha1)); + return e; + } else { + struct overflow_object_entry *e; + + e = xmalloc(sizeof(struct overflow_object_entry)); + e->next = overflow; + memcpy(e->oe.sha1, sha1, sizeof(e->oe.sha1)); + overflow = e; + overflow_count++; + return &e->oe; + } +} + +static struct object_entry* insert_object(unsigned char *sha1) +{ + unsigned int h = sha1[0] << 8 | sha1[1]; + struct object_entry *e = table[h]; + struct object_entry *p = 0; + + while (e) { + if (!memcmp(sha1, e->sha1, sizeof(e->sha1))) + return e; + p = e; + e = e->next; + } + + e = new_object(sha1); + e->next = 0; + e->offset = 0; + 
if (p) + p->next = e; + else + table[h] = e; + return e; +} static ssize_t yread(int fd, void *buffer, size_t length) { @@ -66,7 +130,7 @@ static unsigned long encode_header(enum object_type type, unsigned long size, un return n; } -static void write_blob (void *dat, unsigned long datlen) +static void write_blob(void *dat, unsigned long datlen) { z_stream s; void *out, *delta; @@ -92,6 +156,7 @@ static void write_blob (void *dat, unsigned long datlen) die("Can't write object header: %s", strerror(errno)); if (ywrite(packfd, lastsha1, sizeof(lastsha1)) != sizeof(lastsha1)) die("Can't write object base: %s", strerror(errno)); + packoff += hdrlen + sizeof(lastsha1); } else { current_depth = 0; s.next_in = dat; @@ -99,6 +164,7 @@ static void write_blob (void *dat, unsigned long datlen) hdrlen = encode_header(OBJ_BLOB, datlen, hdr); if (ywrite(packfd, hdr, hdrlen) != hdrlen) die("Can't write object header: %s", strerror(errno)); + packoff += hdrlen; } s.avail_out = deflateBound(&s, s.avail_in); @@ -109,13 +175,14 @@ static void write_blob (void *dat, unsigned long datlen) if (ywrite(packfd, out, s.total_out) != s.total_out) die("Failed writing compressed data %s", strerror(errno)); + packoff += s.total_out; free(out); if (delta) free(delta); } -static void init_pack_header () +static void init_pack_header() { const char* magic = "PACK"; unsigned long version = 2; @@ -129,13 +196,13 @@ static void init_pack_header () die("Can't write pack version: %s", strerror(errno)); if (ywrite(packfd, &zero, 4) != 4) die("Can't write 0 object count: %s", strerror(errno)); + packoff = 4 * 3; } -static void fixup_header_footer () +static void fixup_header_footer() { SHA_CTX c; char hdr[8]; - unsigned char sha1[20]; unsigned long cnt; char *buf; size_t n; @@ -148,7 +215,6 @@ static void fixup_header_footer () die("Failed reading header: %s", strerror(errno)); SHA1_Update(&c, hdr, 8); -fprintf(stderr, "%lu objects\n", object_count); cnt = htonl(object_count); SHA1_Update(&c, &cnt, 4); if 
(ywrite(packfd, &cnt, 4) != 4) @@ -163,16 +229,81 @@ fprintf(stderr, "%lu objects\n", object_count); } free(buf); - SHA1_Final(sha1, &c); - if (ywrite(packfd, sha1, sizeof(sha1)) != sizeof(sha1)) + SHA1_Final(packsha1, &c); + if (ywrite(packfd, packsha1, sizeof(packsha1)) != sizeof(packsha1)) die("Failed writing pack checksum: %s", strerror(errno)); } -int main (int argc, const char **argv) +static int oecmp (const void *_a, const void *_b) { - packfd = open(argv[1], O_RDWR|O_CREAT|O_TRUNC, 0666); + struct object_entry *a = *((struct object_entry**)_a); + struct object_entry *b = *((struct object_entry**)_b); + return memcmp(a->sha1, b->sha1, sizeof(a->sha1)); +} + +static void write_index(const char *idx_name) +{ + struct sha1file *f; + struct object_entry **idx, **c, **last; + struct object_entry *e; + struct overflow_object_entry *o; + unsigned int array[256]; + int i; + + /* Build the sorted table of object IDs. */ + idx = xmalloc(object_count * sizeof(struct object_entry*)); + c = idx; + for (e = pool_start; e != pool_next; e++) + *c++ = e; + for (o = overflow; o; o = o->next) + *c++ = &o->oe; + last = idx + object_count; + qsort(idx, object_count, sizeof(struct object_entry*), oecmp); + + /* Generate the fan-out array. 
*/ + c = idx; + for (i = 0; i < 256; i++) { + struct object_entry **next = c;; + while (next < last) { + if ((*next)->sha1[0] != i) + break; + next++; + } + array[i] = htonl(next - idx); + c = next; + } + + f = sha1create("%s", idx_name); + sha1write(f, array, 256 * sizeof(int)); + for (c = idx; c != last; c++) { + unsigned int offset = htonl((*c)->offset); + sha1write(f, &offset, 4); + sha1write(f, (*c)->sha1, sizeof((*c)->sha1)); + } + sha1write(f, packsha1, sizeof(packsha1)); + sha1close(f, NULL, 1); + free(idx); +} + +int main(int argc, const char **argv) +{ + const char *base_name = argv[1]; + int est_obj_cnt = atoi(argv[2]); + char *pack_name; + char *idx_name; + + pack_name = xmalloc(strlen(base_name) + 6); + sprintf(pack_name, "%s.pack", base_name); + idx_name = xmalloc(strlen(base_name) + 5); + sprintf(idx_name, "%s.idx", base_name); + + packfd = open(pack_name, O_RDWR|O_CREAT|O_TRUNC, 0666); if (packfd < 0) - die("Can't create pack file %s: %s", argv[1], strerror(errno)); + die("Can't create pack file %s: %s", pack_name, strerror(errno)); + + pool_start = xmalloc(est_obj_cnt * sizeof(struct object_entry)); + pool_next = pool_start; + pool_end = pool_start + est_obj_cnt; init_pack_header(); for (;;) { @@ -182,8 +313,10 @@ int main (int argc, const char **argv) char hdr[128]; unsigned char sha1[20]; SHA_CTX c; + struct object_entry *e; if (yread(0, &datlen, 4) != 4) + break; dat = xmalloc(datlen); @@ -196,19 +329,30 @@ int main (int argc, const char **argv) SHA1_Update(&c, dat, datlen); SHA1_Final(sha1, &c); - write_blob(dat, datlen); - object_count++; - printf("%s\n", sha1_to_hex(sha1)); - fflush(stdout); + e = insert_object(sha1); + if (!e->offset) { + e->offset = packoff; + write_blob(dat, datlen); + object_count++; + printf("%s\n", sha1_to_hex(sha1)); + fflush(stdout); - if (lastdat) - free(lastdat); - lastdat = dat; - lastdatlen = datlen; - memcpy(lastsha1, sha1, sizeof(sha1)); + if (lastdat) + free(lastdat); + lastdat = dat; + lastdatlen = datlen; + 
memcpy(lastsha1, sha1, sizeof(sha1)); + } else { + duplicate_count++; + free(dat); + } } fixup_header_footer(); close(packfd); + write_index(idx_name); + + fprintf(stderr, "%lu objects, %lu duplicates, %lu pool overflow\n", + object_count, duplicate_count, overflow_count); return 0; } From 27d6d29035473f01ba5bb3b52c86ee4181d251fe Mon Sep 17 00:00:00 2001 From: "Shawn O. Pearce" Date: Tue, 8 Aug 2006 00:03:59 -0400 Subject: [PATCH 03/81] Cleaned up memory allocation for object_entry structs. Although its easy to ask the user to tell us how many objects they will need, its probably better to dynamically grow the object table in large units. But if the user can give us a hint as to roughly how many objects then we can still use it during startup. Also stopped printing the SHA1 strings to stdout as no user is currently making use of that facility. Signed-off-by: Shawn O. Pearce --- fast-import.c | 97 +++++++++++++++++++++++++-------------------------- 1 file changed, 48 insertions(+), 49 deletions(-) diff --git a/fast-import.c b/fast-import.c index 0d95118499..3856c87c4e 100644 --- a/fast-import.c +++ b/fast-import.c @@ -6,18 +6,6 @@ #include "pack.h" #include "csum-file.h" -static int max_depth = 10; -static unsigned long object_count; -static unsigned long duplicate_count; -static unsigned long packoff; -static unsigned long overflow_count; -static int packfd; -static int current_depth; -static void *lastdat; -static unsigned long lastdatlen; -static unsigned char lastsha1[20]; -static unsigned char packsha1[20]; - struct object_entry { struct object_entry *next; @@ -25,40 +13,57 @@ struct object_entry unsigned char sha1[20]; }; -struct overflow_object_entry +struct object_entry_block { - struct overflow_object_entry *next; - struct object_entry oe; + struct object_entry_block *next_block; + struct object_entry *next_free; + struct object_entry *end; + struct object_entry entries[0]; }; -struct object_entry *pool_start; -struct object_entry *pool_next; -struct 
object_entry *pool_end; -struct overflow_object_entry *overflow; -struct object_entry *table[1 << 16]; +static int max_depth = 10; +static unsigned long alloc_count; +static unsigned long object_count; +static unsigned long duplicate_count; +static unsigned long packoff; +static int packfd; +static int current_depth; +static void *lastdat; +static unsigned long lastdatlen; +static unsigned char lastsha1[20]; +static unsigned char packsha1[20]; +struct object_entry *object_table[1 << 16]; +struct object_entry_block *blocks; + +static void alloc_objects(int cnt) +{ + struct object_entry_block *b; + + b = xmalloc(sizeof(struct object_entry_block) + + cnt * sizeof(struct object_entry)); + b->next_block = blocks; + b->next_free = b->entries; + b->end = b->entries + cnt; + blocks = b; + alloc_count += cnt; +} static struct object_entry* new_object(unsigned char *sha1) { - if (pool_next != pool_end) { - struct object_entry *e = pool_next++; - memcpy(e->sha1, sha1, sizeof(e->sha1)); - return e; - } else { - struct overflow_object_entry *e; + struct object_entry *e; - e = xmalloc(sizeof(struct overflow_object_entry)); - e->next = overflow; - memcpy(e->oe.sha1, sha1, sizeof(e->oe.sha1)); - overflow = e; - overflow_count++; - return &e->oe; - } + if (blocks->next_free == blocks->end) + alloc_objects(1000); + + e = blocks->next_free++; + memcpy(e->sha1, sha1, sizeof(e->sha1)); + return e; } static struct object_entry* insert_object(unsigned char *sha1) { unsigned int h = sha1[0] << 8 | sha1[1]; - struct object_entry *e = table[h]; + struct object_entry *e = object_table[h]; struct object_entry *p = 0; while (e) { @@ -74,7 +79,7 @@ static struct object_entry* insert_object(unsigned char *sha1) if (p) p->next = e; else - table[h] = e; + object_table[h] = e; return e; } @@ -246,17 +251,16 @@ static void write_index(const char *idx_name) struct sha1file *f; struct object_entry **idx, **c, **last; struct object_entry *e; - struct overflow_object_entry *o; + struct 
object_entry_block *o; unsigned int array[256]; int i; /* Build the sorted table of object IDs. */ idx = xmalloc(object_count * sizeof(struct object_entry*)); c = idx; - for (e = pool_start; e != pool_next; e++) - *c++ = e; - for (o = overflow; o; o = o->next) - *c++ = &o->oe; + for (o = blocks; o; o = o->next_block) + for (e = o->entries; e != o->next_free; e++) + *c++ = e; last = idx + object_count; qsort(idx, object_count, sizeof(struct object_entry*), oecmp); @@ -297,14 +301,11 @@ int main(int argc, const char **argv) idx_name = xmalloc(strlen(base_name) + 5); sprintf(idx_name, "%s.idx", base_name); - packfd = open(pack_name, O_RDWR|O_CREAT|O_TRUNC, 0666); + packfd = open(pack_name, O_RDWR|O_CREAT|O_EXCL, 0666); if (packfd < 0) die("Can't create pack file %s: %s", pack_name, strerror(errno)); - pool_start = xmalloc(est_obj_cnt * sizeof(struct object_entry)); - pool_next = pool_start; - pool_end = pool_start + est_obj_cnt; - + alloc_objects(est_obj_cnt); init_pack_header(); for (;;) { unsigned long datlen; @@ -334,8 +335,6 @@ int main(int argc, const char **argv) e->offset = packoff; write_blob(dat, datlen); object_count++; - printf("%s\n", sha1_to_hex(sha1)); - fflush(stdout); if (lastdat) free(lastdat); @@ -351,8 +350,8 @@ int main(int argc, const char **argv) close(packfd); write_index(idx_name); - fprintf(stderr, "%lu objects, %lu duplicates, %lu pool overflow\n", - object_count, duplicate_count, overflow_count); + fprintf(stderr, "%lu objects, %lu duplicates, %lu allocated (%lu overflow)\n", + object_count, duplicate_count, alloc_count, alloc_count - est_obj_cnt); return 0; } From ac47a738a7c866eeffc0c6374c0ef3f7ca6ee79d Mon Sep 17 00:00:00 2001 From: "Shawn O. Pearce" Date: Tue, 8 Aug 2006 00:46:13 -0400 Subject: [PATCH 04/81] Refactored fast-import's internals for future additions. 
Too many globals variables were being used not not enough code was resuable to process trees and commits so this is a simple refactoring of the existing blob processing code to get into a state that will be easier to handle trees and commits in. Signed-off-by: Shawn O. Pearce --- fast-import.c | 149 ++++++++++++++++++++++++++++---------------------- 1 file changed, 83 insertions(+), 66 deletions(-) diff --git a/fast-import.c b/fast-import.c index 3856c87c4e..8b4be28f60 100644 --- a/fast-import.c +++ b/fast-import.c @@ -18,22 +18,34 @@ struct object_entry_block struct object_entry_block *next_block; struct object_entry *next_free; struct object_entry *end; - struct object_entry entries[0]; + struct object_entry entries[FLEX_ARRAY]; /* more */ }; +struct last_object +{ + void *data; + unsigned long len; + int depth; + unsigned char sha1[20]; +}; + +/* Stats and misc. counters. */ static int max_depth = 10; static unsigned long alloc_count; static unsigned long object_count; static unsigned long duplicate_count; -static unsigned long packoff; -static int packfd; -static int current_depth; -static void *lastdat; -static unsigned long lastdatlen; -static unsigned char lastsha1[20]; -static unsigned char packsha1[20]; -struct object_entry *object_table[1 << 16]; + +/* The .pack file */ +static int pack_fd; +static unsigned long pack_offset; +static unsigned char pack_sha1[20]; + +/* Table of objects we've written. 
*/ struct object_entry_block *blocks; +struct object_entry *object_table[1 << 16]; + +/* Our last blob */ +struct last_object last_blob; static void alloc_objects(int cnt) { @@ -115,7 +127,10 @@ static ssize_t ywrite(int fd, void *buffer, size_t length) return ret; } -static unsigned long encode_header(enum object_type type, unsigned long size, unsigned char *hdr) +static unsigned long encode_header( + enum object_type type, + unsigned long size, + unsigned char *hdr) { int n = 1; unsigned char c; @@ -135,41 +150,62 @@ static unsigned long encode_header(enum object_type type, unsigned long size, un return n; } -static void write_blob(void *dat, unsigned long datlen) +static int store_object( + enum object_type type, + void *dat, + unsigned long datlen, + struct last_object *last) { - z_stream s; void *out, *delta; - unsigned char hdr[64]; + struct object_entry *e; + unsigned char hdr[96]; + unsigned char sha1[20]; unsigned long hdrlen, deltalen; + SHA_CTX c; + z_stream s; - if (lastdat && current_depth < max_depth) { - delta = diff_delta(lastdat, lastdatlen, + hdrlen = sprintf((char*)hdr,"%s %lu",type_names[type],datlen) + 1; + SHA1_Init(&c); + SHA1_Update(&c, hdr, hdrlen); + SHA1_Update(&c, dat, datlen); + SHA1_Final(sha1, &c); + + e = insert_object(sha1); + if (e->offset) { + duplicate_count++; + return 0; + } + e->offset = pack_offset; + object_count++; + + if (last->data && last->depth < max_depth) + delta = diff_delta(last->data, last->len, dat, datlen, &deltalen, 0); - } else + else delta = 0; memset(&s, 0, sizeof(s)); deflateInit(&s, zlib_compression_level); if (delta) { - current_depth++; + last->depth++; s.next_in = delta; s.avail_in = deltalen; hdrlen = encode_header(OBJ_DELTA, deltalen, hdr); - if (ywrite(packfd, hdr, hdrlen) != hdrlen) + if (ywrite(pack_fd, hdr, hdrlen) != hdrlen) die("Can't write object header: %s", strerror(errno)); - if (ywrite(packfd, lastsha1, sizeof(lastsha1)) != sizeof(lastsha1)) + if (ywrite(pack_fd, last->sha1, sizeof(sha1)) != 
sizeof(sha1)) die("Can't write object base: %s", strerror(errno)); - packoff += hdrlen + sizeof(lastsha1); + pack_offset += hdrlen + sizeof(sha1); } else { - current_depth = 0; + last->depth = 0; s.next_in = dat; s.avail_in = datlen; - hdrlen = encode_header(OBJ_BLOB, datlen, hdr); - if (ywrite(packfd, hdr, hdrlen) != hdrlen) + hdrlen = encode_header(type, datlen, hdr); + if (ywrite(pack_fd, hdr, hdrlen) != hdrlen) die("Can't write object header: %s", strerror(errno)); - packoff += hdrlen; + pack_offset += hdrlen; } s.avail_out = deflateBound(&s, s.avail_in); @@ -178,13 +214,19 @@ static void write_blob(void *dat, unsigned long datlen) /* nothing */; deflateEnd(&s); - if (ywrite(packfd, out, s.total_out) != s.total_out) + if (ywrite(pack_fd, out, s.total_out) != s.total_out) die("Failed writing compressed data %s", strerror(errno)); - packoff += s.total_out; + pack_offset += s.total_out; free(out); if (delta) free(delta); + if (last->data) + free(last->data); + last->data = dat; + last->len = datlen; + memcpy(last->sha1, sha1, sizeof(sha1)); + return 1; } static void init_pack_header() @@ -195,13 +237,13 @@ static void init_pack_header() version = htonl(version); - if (ywrite(packfd, (char*)magic, 4) != 4) + if (ywrite(pack_fd, (char*)magic, 4) != 4) die("Can't write pack magic: %s", strerror(errno)); - if (ywrite(packfd, &version, 4) != 4) + if (ywrite(pack_fd, &version, 4) != 4) die("Can't write pack version: %s", strerror(errno)); - if (ywrite(packfd, &zero, 4) != 4) + if (ywrite(pack_fd, &zero, 4) != 4) die("Can't write 0 object count: %s", strerror(errno)); - packoff = 4 * 3; + pack_offset = 4 * 3; } static void fixup_header_footer() @@ -212,30 +254,30 @@ static void fixup_header_footer() char *buf; size_t n; - if (lseek(packfd, 0, SEEK_SET) != 0) + if (lseek(pack_fd, 0, SEEK_SET) != 0) die("Failed seeking to start: %s", strerror(errno)); SHA1_Init(&c); - if (yread(packfd, hdr, 8) != 8) + if (yread(pack_fd, hdr, 8) != 8) die("Failed reading header: %s", 
strerror(errno)); SHA1_Update(&c, hdr, 8); cnt = htonl(object_count); SHA1_Update(&c, &cnt, 4); - if (ywrite(packfd, &cnt, 4) != 4) + if (ywrite(pack_fd, &cnt, 4) != 4) die("Failed writing object count: %s", strerror(errno)); buf = xmalloc(128 * 1024); for (;;) { - n = xread(packfd, buf, 128 * 1024); + n = xread(pack_fd, buf, 128 * 1024); if (n <= 0) break; SHA1_Update(&c, buf, n); } free(buf); - SHA1_Final(packsha1, &c); - if (ywrite(packfd, packsha1, sizeof(packsha1)) != sizeof(packsha1)) + SHA1_Final(pack_sha1, &c); + if (ywrite(pack_fd, pack_sha1, sizeof(pack_sha1)) != sizeof(pack_sha1)) die("Failed writing pack checksum: %s", strerror(errno)); } @@ -284,7 +326,7 @@ static void write_index(const char *idx_name) sha1write(f, &offset, 4); sha1write(f, (*c)->sha1, sizeof((*c)->sha1)); } - sha1write(f, packsha1, sizeof(packsha1)); + sha1write(f, pack_sha1, sizeof(pack_sha1)); sha1close(f, NULL, 1); free(idx); } @@ -301,53 +343,28 @@ int main(int argc, const char **argv) idx_name = xmalloc(strlen(base_name) + 5); sprintf(idx_name, "%s.idx", base_name); - packfd = open(pack_name, O_RDWR|O_CREAT|O_EXCL, 0666); - if (packfd < 0) + pack_fd = open(pack_name, O_RDWR|O_CREAT|O_EXCL, 0666); + if (pack_fd < 0) die("Can't create pack file %s: %s", pack_name, strerror(errno)); alloc_objects(est_obj_cnt); init_pack_header(); for (;;) { unsigned long datlen; - int hdrlen; void *dat; - char hdr[128]; - unsigned char sha1[20]; - SHA_CTX c; - struct object_entry *e; if (yread(0, &datlen, 4) != 4) - break; dat = xmalloc(datlen); if (yread(0, dat, datlen) != datlen) break; - hdrlen = sprintf(hdr, "blob %lu", datlen) + 1; - SHA1_Init(&c); - SHA1_Update(&c, hdr, hdrlen); - SHA1_Update(&c, dat, datlen); - SHA1_Final(sha1, &c); - - e = insert_object(sha1); - if (!e->offset) { - e->offset = packoff; - write_blob(dat, datlen); - object_count++; - - if (lastdat) - free(lastdat); - lastdat = dat; - lastdatlen = datlen; - memcpy(lastsha1, sha1, sizeof(sha1)); - } else { - duplicate_count++; + 
if (!store_object(OBJ_BLOB, dat, datlen, &last_blob)) free(dat); - } } fixup_header_footer(); - close(packfd); + close(pack_fd); write_index(idx_name); fprintf(stderr, "%lu objects, %lu duplicates, %lu allocated (%lu overflow)\n", From 6143f0644e79686407c1dc0e1b4dadff74e80046 Mon Sep 17 00:00:00 2001 From: "Shawn O. Pearce" Date: Tue, 8 Aug 2006 01:14:21 -0400 Subject: [PATCH 05/81] Added basic command handler to fast-import. Moved the new_blob logic off into a new subroutine and invoked it when getting the 'blob' command. Added statistics dump to STDERR when the program terminates listing what it did at a high level. This is somewhat interesting. Signed-off-by: Shawn O. Pearce --- fast-import.c | 60 +++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 46 insertions(+), 14 deletions(-) diff --git a/fast-import.c b/fast-import.c index 8b4be28f60..c9c48c5869 100644 --- a/fast-import.c +++ b/fast-import.c @@ -34,6 +34,8 @@ static int max_depth = 10; static unsigned long alloc_count; static unsigned long object_count; static unsigned long duplicate_count; +static unsigned long object_count_by_type[9]; +static unsigned long duplicate_count_by_type[9]; /* The .pack file */ static int pack_fd; @@ -173,10 +175,12 @@ static int store_object( e = insert_object(sha1); if (e->offset) { duplicate_count++; + duplicate_count_by_type[type]++; return 0; } e->offset = pack_offset; object_count++; + object_count_by_type[type]++; if (last->data && last->depth < max_depth) delta = diff_delta(last->data, last->len, @@ -232,7 +236,7 @@ static int store_object( static void init_pack_header() { const char* magic = "PACK"; - unsigned long version = 2; + unsigned long version = 3; unsigned long zero = 0; version = htonl(version); @@ -331,12 +335,29 @@ static void write_index(const char *idx_name) free(idx); } +static void new_blob() +{ + unsigned long datlen; + void *dat; + + if (yread(0, &datlen, 4) != 4) + die("Can't obtain blob length"); + + dat = xmalloc(datlen); + if 
(yread(0, dat, datlen) != datlen) + die("Con't obtain %lu bytes of blob data", datlen); + + if (!store_object(OBJ_BLOB, dat, datlen, &last_blob)) + free(dat); +} + int main(int argc, const char **argv) { const char *base_name = argv[1]; int est_obj_cnt = atoi(argv[2]); char *pack_name; char *idx_name; + struct stat sb; pack_name = xmalloc(strlen(base_name) + 6); sprintf(pack_name, "%s.pack", base_name); @@ -345,30 +366,41 @@ int main(int argc, const char **argv) pack_fd = open(pack_name, O_RDWR|O_CREAT|O_EXCL, 0666); if (pack_fd < 0) - die("Can't create pack file %s: %s", pack_name, strerror(errno)); + die("Can't create %s: %s", pack_name, strerror(errno)); alloc_objects(est_obj_cnt); init_pack_header(); for (;;) { - unsigned long datlen; - void *dat; - - if (yread(0, &datlen, 4) != 4) + unsigned long cmd; + if (yread(0, &cmd, 4) != 4) break; - dat = xmalloc(datlen); - if (yread(0, dat, datlen) != datlen) - break; - - if (!store_object(OBJ_BLOB, dat, datlen, &last_blob)) - free(dat); + switch (cmd) { + case 'blob': new_blob(); break; + default: + die("Invalid command %lu", cmd); + } } fixup_header_footer(); close(pack_fd); write_index(idx_name); - fprintf(stderr, "%lu objects, %lu duplicates, %lu allocated (%lu overflow)\n", - object_count, duplicate_count, alloc_count, alloc_count - est_obj_cnt); + fprintf(stderr, "%s statistics:\n", argv[0]); + fprintf(stderr, "---------------------------------------------------\n"); + fprintf(stderr, "Alloc'd objects: %10lu (%10lu overflow )\n", alloc_count, alloc_count - est_obj_cnt); + fprintf(stderr, "Total objects: %10lu (%10lu duplicates)\n", object_count, duplicate_count); + fprintf(stderr, " blobs : %10lu (%10lu duplicates)\n", object_count_by_type[OBJ_BLOB], duplicate_count_by_type[OBJ_BLOB]); + fprintf(stderr, " trees : %10lu (%10lu duplicates)\n", object_count_by_type[OBJ_TREE], duplicate_count_by_type[OBJ_TREE]); + fprintf(stderr, " commits: %10lu (%10lu duplicates)\n", object_count_by_type[OBJ_COMMIT], 
duplicate_count_by_type[OBJ_COMMIT]); + fprintf(stderr, " tags : %10lu (%10lu duplicates)\n", object_count_by_type[OBJ_TAG], duplicate_count_by_type[OBJ_TAG]); + fprintf(stderr, "---------------------------------------------------\n"); + + stat(pack_name, &sb); + fprintf(stderr, "Pack size: %10lu KiB\n", (unsigned long)(sb.st_size/1024)); + stat(idx_name, &sb); + fprintf(stderr, "Index size: %10lu KiB\n", (unsigned long)(sb.st_size/1024)); + + fprintf(stderr, "\n"); return 0; } From 6bb5b3291df99bf050c91ab742b406d2404b8f73 Mon Sep 17 00:00:00 2001 From: "Shawn O. Pearce" Date: Tue, 8 Aug 2006 03:36:45 -0400 Subject: [PATCH 06/81] Implemented branch handling and basic tree support in fast-import. This provides the basic data structures needed to store trees in memory while we are processing them for a branch. What we are attempting to do is track one complete tree for each branch that the frontend has registered with us through the 'newb' (new_branch) command. When the frontend edits that tree through 'updf' or 'delf' commands we'll mark the affected tree(s) as being dirty and recompute their objects during 'comt' (commit). Currently the protocol is decidedly _not_ user friendly. I crashed fast-import by giving it bad input data from Perl. I may try to improve upon it, or at least upon its error handling. Signed-off-by: Shawn O. 
Pearce --- fast-import.c | 170 ++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 165 insertions(+), 5 deletions(-) diff --git a/fast-import.c b/fast-import.c index c9c48c5869..98c5d1cbdd 100644 --- a/fast-import.c +++ b/fast-import.c @@ -24,14 +24,39 @@ struct object_entry_block struct last_object { void *data; - unsigned long len; - int depth; + unsigned int len; + unsigned int depth; unsigned char sha1[20]; }; +struct tree; +struct tree_entry +{ + struct tree *tree; + mode_t mode; + unsigned char sha1[20]; + char name[FLEX_ARRAY]; /* more */ +}; + +struct tree +{ + struct last_object last_tree; + unsigned long entry_count; + struct tree_entry **entries; +}; + +struct branch +{ + struct branch *next_branch; + struct tree_entry tree; + unsigned char sha1[20]; + const char *name; +}; + /* Stats and misc. counters. */ static int max_depth = 10; static unsigned long alloc_count; +static unsigned long branch_count; static unsigned long object_count; static unsigned long duplicate_count; static unsigned long object_count_by_type[9]; @@ -49,6 +74,10 @@ struct object_entry *object_table[1 << 16]; /* Our last blob */ struct last_object last_blob; +/* Branch data */ +struct branch *branches; +struct branch *current_branch; + static void alloc_objects(int cnt) { struct object_entry_block *b; @@ -129,6 +158,32 @@ static ssize_t ywrite(int fd, void *buffer, size_t length) return ret; } +static const char* read_string() +{ + static char sn[PATH_MAX]; + unsigned long slen; + + if (yread(0, &slen, 4) != 4) + die("Can't obtain string"); + if (!slen) + return 0; + if (slen > (PATH_MAX - 1)) + die("Can't handle excessive string length %lu", slen); + + if (yread(0, sn, slen) != slen) + die("Can't obtain string of length %lu", slen); + sn[slen] = 0; + return sn; +} + +static const char* read_required_string() +{ + const char *r = read_string(); + if (!r) + die("Expected string command parameter, didn't find one"); + return r; +} + static unsigned long encode_header( 
enum object_type type, unsigned long size, @@ -156,7 +211,8 @@ static int store_object( enum object_type type, void *dat, unsigned long datlen, - struct last_object *last) + struct last_object *last, + unsigned char *sha1out) { void *out, *delta; struct object_entry *e; @@ -171,6 +227,8 @@ static int store_object( SHA1_Update(&c, hdr, hdrlen); SHA1_Update(&c, dat, datlen); SHA1_Final(sha1, &c); + if (sha1out) + memcpy(sha1out, sha1, sizeof(sha1)); e = insert_object(sha1); if (e->offset) { @@ -347,10 +405,108 @@ static void new_blob() if (yread(0, dat, datlen) != datlen) die("Con't obtain %lu bytes of blob data", datlen); - if (!store_object(OBJ_BLOB, dat, datlen, &last_blob)) + if (!store_object(OBJ_BLOB, dat, datlen, &last_blob, 0)) free(dat); } +static struct branch* lookup_branch(const char *name) +{ + struct branch *b; + for (b = branches; b; b = b->next_branch) { + if (!strcmp(name, b->name)) + return b; + } + die("No branch named '%s' has been declared", name); +} + +static struct tree* deep_copy_tree (struct tree *t) +{ + struct tree *r = xmalloc(sizeof(struct tree)); + unsigned long i; + + if (t->last_tree.data) { + r->last_tree.data = xmalloc(t->last_tree.len); + r->last_tree.len = t->last_tree.len; + r->last_tree.depth = t->last_tree.depth; + memcpy(r->last_tree.data, t->last_tree.data, t->last_tree.len); + memcpy(r->last_tree.sha1, t->last_tree.sha1, sizeof(t->last_tree.sha1)); + } + + r->entry_count = t->entry_count; + r->entries = xmalloc(t->entry_count * sizeof(struct tree_entry*)); + for (i = 0; i < t->entry_count; i++) { + struct tree_entry *a = t->entries[i]; + struct tree_entry *b; + + b = xmalloc(sizeof(struct tree_entry) + strlen(a->name) + 1); + b->tree = a->tree ? 
deep_copy_tree(a->tree) : 0; + b->mode = a->mode; + memcpy(b->sha1, a->sha1, sizeof(a->sha1)); + strcpy(b->name, a->name); + r->entries[i] = b; + } + + return r; +} + +static void store_tree (struct tree_entry *e) +{ + struct tree *t = e->tree; + unsigned long maxlen, i; + char *buf, *c; + + if (memcmp(null_sha1, e->sha1, sizeof(e->sha1))) + return; + + maxlen = t->entry_count * 32; + for (i = 0; i < t->entry_count; i++) + maxlen += strlen(t->entries[i]->name); + + buf = c = xmalloc(maxlen); + for (i = 0; i < t->entry_count; i++) { + struct tree_entry *e = t->entries[i]; + c += sprintf(c, "%o %s", e->mode, e->name) + 1; + if (e->tree) + store_tree(e); + memcpy(c, e->sha1, sizeof(e->sha1)); + c += sizeof(e->sha1); + } + + if (!store_object(OBJ_TREE, buf, c - buf, &t->last_tree, e->sha1)) + free(buf); +} + +static void new_branch() +{ + struct branch *nb = xcalloc(1, sizeof(struct branch)); + const char *source_name; + + nb->name = strdup(read_required_string()); + source_name = read_string(); + if (source_name) { + struct branch *sb = lookup_branch(source_name); + nb->tree.tree = deep_copy_tree(sb->tree.tree); + memcpy(nb->tree.sha1, sb->tree.sha1, sizeof(sb->tree.sha1)); + memcpy(nb->sha1, sb->sha1, sizeof(sb->sha1)); + } else { + nb->tree.tree = xcalloc(1, sizeof(struct tree)); + nb->tree.tree->entries = xmalloc(8*sizeof(struct tree_entry*)); + } + nb->next_branch = branches; + branches = nb; + branch_count++; +} + +static void set_branch() +{ + current_branch = lookup_branch(read_required_string()); +} + +static void commit() +{ + store_tree(¤t_branch->tree); +} + int main(int argc, const char **argv) { const char *base_name = argv[1]; @@ -376,7 +532,10 @@ int main(int argc, const char **argv) break; switch (cmd) { - case 'blob': new_blob(); break; + case 'blob': new_blob(); break; + case 'newb': new_branch(); break; + case 'setb': set_branch(); break; + case 'comt': commit(); break; default: die("Invalid command %lu", cmd); } @@ -393,6 +552,7 @@ int main(int 
argc, const char **argv) fprintf(stderr, " trees : %10lu (%10lu duplicates)\n", object_count_by_type[OBJ_TREE], duplicate_count_by_type[OBJ_TREE]); fprintf(stderr, " commits: %10lu (%10lu duplicates)\n", object_count_by_type[OBJ_COMMIT], duplicate_count_by_type[OBJ_COMMIT]); fprintf(stderr, " tags : %10lu (%10lu duplicates)\n", object_count_by_type[OBJ_TAG], duplicate_count_by_type[OBJ_TAG]); + fprintf(stderr, "Total branches: %10lu\n", branch_count); fprintf(stderr, "---------------------------------------------------\n"); stat(pack_name, &sb); From 463acbe1c60fc5009dac9d033df6c2b9c5037a91 Mon Sep 17 00:00:00 2001 From: "Shawn O. Pearce" Date: Mon, 14 Aug 2006 00:58:19 -0400 Subject: [PATCH 07/81] Added tree and commit writing to fast-import. The tree of the current commit can be altered by file_change commands before the commit gets written to the pack. The file changes are rather primitive as they simply allow removal of a tree entry or setting/adding a tree entry. Currently trees and commits aren't being deltafied when written to the pack and branch reloading from the current pack doesn't work, so at most 5 branches can be worked with at any one time. Signed-off-by: Shawn O. 
Pearce --- fast-import.c | 914 ++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 733 insertions(+), 181 deletions(-) diff --git a/fast-import.c b/fast-import.c index 98c5d1cbdd..4605b7469b 100644 --- a/fast-import.c +++ b/fast-import.c @@ -1,9 +1,70 @@ +/* +Format of STDIN stream: + + stream ::= cmd*; + + cmd ::= new_blob + | new_commit + | new_branch + | new_tag + ; + + new_blob ::= 'blob' blob_data; + + new_commit ::= 'comt' ref_name author_committer_msg + file_change* + '0'; + + new_branch ::= 'brch' dst_ref_name src_ref_name; + dst_ref_name ::= ref_name; + src_ref_name ::= ref_name | sha1_exp; + + new_tag ::= 'tagg' ref_name tag_name tagger_msg; + + file_change ::= 'M' path_name hexsha1 + | 'D' path_name + ; + + author_committer_msg ::= len32 + 'author' sp name '<' email '>' ts tz lf + 'committer' sp name '<' email '>' ts tz lf + lf + binary_data; + + tagger_msg ::= len32 + 'tagger' sp name '<' email '>' ts tz lf + lf + binary_data; + + blob_data ::= len32 binary_data; # max len is 2^32-1 + path_name ::= len32 path; # max len is PATH_MAX-1 + ref_name ::= len32 ref; # max len is PATH_MAX-1 + tag_name ::= len32 tag; # max len is PATH_MAX-1 + sha1_exp ::= len32 sha1exp; # max len is PATH_MAX-1 + + len32 ::= # unsigned 32 bit value, native format; + binary_data ::= # file content, not interpreted; + sp ::= # ASCII space character; + lf ::= # ASCII newline (LF) character; + path ::= # GIT style file path, e.g. "a/b/c"; + ref ::= # GIT ref name, e.g. "refs/heads/MOZ_GECKO_EXPERIMENT"; + tag ::= # GIT tag name, e.g. 
"FIREFOX_1_5"; + sha1exp ::= # Any valid GIT SHA1 expression; + hexsha1 ::= # SHA1 in hexadecimal format; + name ::= # valid GIT author/committer name; + email ::= # valid GIT author/committer email; + ts ::= # time since the epoch in seconds, ascii decimal; + tz ::= # GIT style timezone; +*/ + #include "builtin.h" #include "cache.h" #include "object.h" #include "blob.h" +#include "tree.h" #include "delta.h" #include "pack.h" +#include "refs.h" #include "csum-file.h" struct object_entry @@ -13,9 +74,9 @@ struct object_entry unsigned char sha1[20]; }; -struct object_entry_block +struct object_entry_pool { - struct object_entry_block *next_block; + struct object_entry_pool *next_pool; struct object_entry *next_free; struct object_entry *end; struct object_entry entries[FLEX_ARRAY]; /* more */ @@ -29,31 +90,55 @@ struct last_object unsigned char sha1[20]; }; -struct tree; -struct tree_entry +struct mem_pool { - struct tree *tree; - mode_t mode; - unsigned char sha1[20]; - char name[FLEX_ARRAY]; /* more */ + struct mem_pool *next_pool; + char *next_free; + char *end; + char space[FLEX_ARRAY]; /* more */ }; -struct tree +struct atom_str { - struct last_object last_tree; - unsigned long entry_count; - struct tree_entry **entries; + struct atom_str *next_atom; + int str_len; + char str_dat[FLEX_ARRAY]; /* more */ +}; + +struct tree_content; +struct tree_entry +{ + struct tree_content *tree; + struct atom_str* name; + unsigned int mode; + unsigned char sha1[20]; +}; + +struct tree_content +{ + unsigned int entry_capacity; /* must match avail_tree_content */ + unsigned int entry_count; + struct tree_entry *entries[FLEX_ARRAY]; /* more */ +}; + +struct avail_tree_content +{ + unsigned int entry_capacity; /* must match tree_content */ + struct avail_tree_content *next_avail; }; struct branch { - struct branch *next_branch; - struct tree_entry tree; - unsigned char sha1[20]; + struct branch *table_next_branch; + struct branch *active_next_branch; const char *name; + unsigned 
long last_commit; + struct tree_entry branch_tree; + unsigned char sha1[20]; }; -/* Stats and misc. counters. */ + +/* Stats and misc. counters */ static int max_depth = 10; static unsigned long alloc_count; static unsigned long branch_count; @@ -62,29 +147,50 @@ static unsigned long duplicate_count; static unsigned long object_count_by_type[9]; static unsigned long duplicate_count_by_type[9]; -/* The .pack file */ +/* Memory pools */ +static size_t mem_pool_alloc = 2*1024*1024 - sizeof(struct mem_pool); +static size_t total_allocd; +static struct mem_pool *mem_pool; + +/* atom management */ +static unsigned int atom_table_sz = 4451; +static unsigned int atom_cnt; +static struct atom_str **atom_table; + +/* The .pack file being generated */ static int pack_fd; static unsigned long pack_offset; static unsigned char pack_sha1[20]; /* Table of objects we've written. */ -struct object_entry_block *blocks; -struct object_entry *object_table[1 << 16]; +static unsigned int object_entry_alloc = 1000; +static struct object_entry_pool *blocks; +static struct object_entry *object_table[1 << 16]; /* Our last blob */ -struct last_object last_blob; +static struct last_object last_blob; + +/* Tree management */ +static unsigned int tree_entry_alloc = 1000; +static void *avail_tree_entry; +static unsigned int avail_tree_table_sz = 100; +static struct avail_tree_content **avail_tree_table; /* Branch data */ -struct branch *branches; -struct branch *current_branch; +static unsigned int max_active_branches = 5; +static unsigned int cur_active_branches; +static unsigned int branch_table_sz = 1039; +static struct branch **branch_table; +static struct branch *active_branches; + static void alloc_objects(int cnt) { - struct object_entry_block *b; + struct object_entry_pool *b; - b = xmalloc(sizeof(struct object_entry_block) + b = xmalloc(sizeof(struct object_entry_pool) + cnt * sizeof(struct object_entry)); - b->next_block = blocks; + b->next_pool = blocks; b->next_free = b->entries; 
b->end = b->entries + cnt; blocks = b; @@ -96,18 +202,28 @@ static struct object_entry* new_object(unsigned char *sha1) struct object_entry *e; if (blocks->next_free == blocks->end) - alloc_objects(1000); + alloc_objects(object_entry_alloc); e = blocks->next_free++; memcpy(e->sha1, sha1, sizeof(e->sha1)); return e; } +static struct object_entry* find_object(unsigned char *sha1) +{ + unsigned int h = sha1[0] << 8 | sha1[1]; + struct object_entry *e; + for (e = object_table[h]; e; e = e->next) + if (!memcmp(sha1, e->sha1, sizeof(e->sha1))) + return e; + return NULL; +} + static struct object_entry* insert_object(unsigned char *sha1) { unsigned int h = sha1[0] << 8 | sha1[1]; struct object_entry *e = object_table[h]; - struct object_entry *p = 0; + struct object_entry *p = NULL; while (e) { if (!memcmp(sha1, e->sha1, sizeof(e->sha1))) @@ -117,7 +233,7 @@ static struct object_entry* insert_object(unsigned char *sha1) } e = new_object(sha1); - e->next = 0; + e->next = NULL; e->offset = 0; if (p) p->next = e; @@ -126,64 +242,240 @@ static struct object_entry* insert_object(unsigned char *sha1) return e; } -static ssize_t yread(int fd, void *buffer, size_t length) +static unsigned int hc_str(const char *s, size_t len) +{ + unsigned int r = 0; + while (len-- > 0) + r = r * 31 + *s++; + return r; +} + +static void* pool_alloc(size_t len) +{ + struct mem_pool *p; + void *r; + + for (p = mem_pool; p; p = p->next_pool) + if ((p->end - p->next_free >= len)) + break; + + if (!p) { + if (len >= (mem_pool_alloc/2)) { + total_allocd += len; + return xmalloc(len); + } + total_allocd += sizeof(struct mem_pool) + mem_pool_alloc; + p = xmalloc(sizeof(struct mem_pool) + mem_pool_alloc); + p->next_pool = mem_pool; + p->next_free = p->space; + p->end = p->next_free + mem_pool_alloc; + mem_pool = p; + } + + r = p->next_free; + p->next_free += len; + return r; +} + +static void* pool_calloc(size_t count, size_t size) +{ + size_t len = count * size; + void *r = pool_alloc(len); + memset(r, 
0, len); + return r; +} + +static char* pool_strdup(const char *s) +{ + char *r = pool_alloc(strlen(s) + 1); + strcpy(r, s); + return r; +} + +static struct atom_str* to_atom(const char *s, size_t len) +{ + unsigned int hc = hc_str(s, len) % atom_table_sz; + struct atom_str *c; + + for (c = atom_table[hc]; c; c = c->next_atom) + if (c->str_len == len && !strncmp(s, c->str_dat, len)) + return c; + + c = pool_alloc(sizeof(struct atom_str) + len + 1); + c->str_len = len; + strncpy(c->str_dat, s, len); + c->str_dat[len] = 0; + c->next_atom = atom_table[hc]; + atom_table[hc] = c; + atom_cnt++; + return c; +} + +static struct branch* lookup_branch(const char *name) +{ + unsigned int hc = hc_str(name, strlen(name)) % branch_table_sz; + struct branch *b; + + for (b = branch_table[hc]; b; b = b->table_next_branch) + if (!strcmp(name, b->name)) + return b; + return NULL; +} + +static struct branch* new_branch(const char *name) +{ + unsigned int hc = hc_str(name, strlen(name)) % branch_table_sz; + struct branch* b = lookup_branch(name); + + if (b) + die("Invalid attempt to create duplicate branch: %s", name); + + b = pool_calloc(1, sizeof(struct branch)); + b->name = pool_strdup(name); + b->table_next_branch = branch_table[hc]; + branch_table[hc] = b; + branch_count++; + return b; +} + +static unsigned int hc_entries(unsigned int cnt) +{ + cnt = cnt & 7 ? (cnt / 8) + 1 : cnt / 8; + return cnt < avail_tree_table_sz ? cnt : avail_tree_table_sz - 1; +} + +static struct tree_content* new_tree_content(unsigned int cnt) +{ + struct avail_tree_content *f, *l = NULL; + struct tree_content *t; + unsigned int hc = hc_entries(cnt); + + for (f = avail_tree_table[hc]; f; l = f, f = f->next_avail) + if (f->entry_capacity >= cnt) + break; + + if (f) { + if (l) + l->next_avail = f->next_avail; + else + avail_tree_table[hc] = f->next_avail; + } else { + cnt = cnt & 7 ? 
((cnt / 8) + 1) * 8 : cnt; + f = pool_alloc(sizeof(*t) + sizeof(t->entries[0]) * cnt); + f->entry_capacity = cnt; + } + + t = (struct tree_content*)f; + t->entry_count = 0; + return t; +} + +static void release_tree_entry(struct tree_entry *e); +static void release_tree_content(struct tree_content *t) +{ + struct avail_tree_content *f = (struct avail_tree_content*)t; + unsigned int hc = hc_entries(f->entry_capacity); + unsigned int i; + for (i = 0; i < t->entry_count; i++) + release_tree_entry(t->entries[i]); + f->next_avail = avail_tree_table[hc]; + avail_tree_table[hc] = f; +} + +static struct tree_content* grow_tree_content( + struct tree_content *t, + int amt) +{ + struct tree_content *r = new_tree_content(t->entry_count + amt); + r->entry_count = t->entry_count; + memcpy(r->entries,t->entries,t->entry_count*sizeof(t->entries[0])); + release_tree_content(t); + return r; +} + +static struct tree_entry* new_tree_entry() +{ + struct tree_entry *e; + + if (!avail_tree_entry) { + unsigned int n = tree_entry_alloc; + avail_tree_entry = e = xmalloc(n * sizeof(struct tree_entry)); + while (n--) { + *((void**)e) = e + 1; + e++; + } + } + + e = avail_tree_entry; + avail_tree_entry = *((void**)e); + return e; +} + +static void release_tree_entry(struct tree_entry *e) +{ + if (e->tree) + release_tree_content(e->tree); + *((void**)e) = avail_tree_entry; + avail_tree_entry = e; +} + +static void yread(int fd, void *buffer, size_t length) { ssize_t ret = 0; while (ret < length) { ssize_t size = xread(fd, (char *) buffer + ret, length - ret); - if (size < 0) { - return size; - } - if (size == 0) { - return ret; - } + if (!size) + die("Read from descriptor %i: end of stream", fd); + if (size < 0) + die("Read from descriptor %i: %s", fd, strerror(errno)); ret += size; } - return ret; } -static ssize_t ywrite(int fd, void *buffer, size_t length) +static int optional_read(int fd, void *buffer, size_t length) +{ + ssize_t ret = 0; + while (ret < length) { + ssize_t size = xread(fd, 
(char *) buffer + ret, length - ret); + if (!size && !ret) + return 1; + if (!size) + die("Read from descriptor %i: end of stream", fd); + if (size < 0) + die("Read from descriptor %i: %s", fd, strerror(errno)); + ret += size; + } + return 0; +} + +static void ywrite(int fd, void *buffer, size_t length) { ssize_t ret = 0; while (ret < length) { ssize_t size = xwrite(fd, (char *) buffer + ret, length - ret); - if (size < 0) { - return size; - } - if (size == 0) { - return ret; - } + if (!size) + die("Write to descriptor %i: end of file", fd); + if (size < 0) + die("Write to descriptor %i: %s", fd, strerror(errno)); ret += size; } - return ret; } -static const char* read_string() +static const char* read_path() { static char sn[PATH_MAX]; unsigned long slen; - if (yread(0, &slen, 4) != 4) - die("Can't obtain string"); + yread(0, &slen, 4); if (!slen) - return 0; + die("Expected string command parameter, didn't find one"); if (slen > (PATH_MAX - 1)) die("Can't handle excessive string length %lu", slen); - - if (yread(0, sn, slen) != slen) - die("Can't obtain string of length %lu", slen); + yread(0, sn, slen); sn[slen] = 0; return sn; } -static const char* read_required_string() -{ - const char *r = read_string(); - if (!r) - die("Expected string command parameter, didn't find one"); - return r; -} - static unsigned long encode_header( enum object_type type, unsigned long size, @@ -234,13 +526,13 @@ static int store_object( if (e->offset) { duplicate_count++; duplicate_count_by_type[type]++; - return 0; + return 1; } e->offset = pack_offset; object_count++; object_count_by_type[type]++; - if (last->data && last->depth < max_depth) + if (last && last->data && last->depth < max_depth) delta = diff_delta(last->data, last->len, dat, datlen, &deltalen, 0); @@ -255,18 +547,16 @@ static int store_object( s.next_in = delta; s.avail_in = deltalen; hdrlen = encode_header(OBJ_DELTA, deltalen, hdr); - if (ywrite(pack_fd, hdr, hdrlen) != hdrlen) - die("Can't write object header: 
%s", strerror(errno)); - if (ywrite(pack_fd, last->sha1, sizeof(sha1)) != sizeof(sha1)) - die("Can't write object base: %s", strerror(errno)); + ywrite(pack_fd, hdr, hdrlen); + ywrite(pack_fd, last->sha1, sizeof(sha1)); pack_offset += hdrlen + sizeof(sha1); } else { - last->depth = 0; + if (last) + last->depth = 0; s.next_in = dat; s.avail_in = datlen; hdrlen = encode_header(type, datlen, hdr); - if (ywrite(pack_fd, hdr, hdrlen) != hdrlen) - die("Can't write object header: %s", strerror(errno)); + ywrite(pack_fd, hdr, hdrlen); pack_offset += hdrlen; } @@ -276,18 +566,220 @@ static int store_object( /* nothing */; deflateEnd(&s); - if (ywrite(pack_fd, out, s.total_out) != s.total_out) - die("Failed writing compressed data %s", strerror(errno)); + ywrite(pack_fd, out, s.total_out); pack_offset += s.total_out; free(out); if (delta) free(delta); - if (last->data) - free(last->data); - last->data = dat; - last->len = datlen; - memcpy(last->sha1, sha1, sizeof(sha1)); + if (last) { + if (last->data) + free(last->data); + last->data = dat; + last->len = datlen; + memcpy(last->sha1, sha1, sizeof(sha1)); + } + return 0; +} + +static const char *get_mode(const char *str, unsigned int *modep) +{ + unsigned char c; + unsigned int mode = 0; + + while ((c = *str++) != ' ') { + if (c < '0' || c > '7') + return NULL; + mode = (mode << 3) + (c - '0'); + } + *modep = mode; + return str; +} + +static void load_tree(struct tree_entry *root) +{ + struct object_entry *myoe; + struct tree_content *t; + unsigned long size; + char *buf; + const char *c; + char type[20]; + + root->tree = t = new_tree_content(8); + if (!memcmp(root->sha1, null_sha1, 20)) + return; + + myoe = find_object(root->sha1); + if (myoe) { + die("FIXME"); + } else { + buf = read_sha1_file(root->sha1, type, &size); + if (!buf || strcmp(type, tree_type)) + die("Can't load existing tree %s", sha1_to_hex(root->sha1)); + } + + c = buf; + while (c != (buf + size)) { + struct tree_entry *e = new_tree_entry(); + + if 
(t->entry_count == t->entry_capacity) + root->tree = t = grow_tree_content(t, 8); + t->entries[t->entry_count++] = e; + + e->tree = NULL; + c = get_mode(c, &e->mode); + if (!c) + die("Corrupt mode in %s", sha1_to_hex(root->sha1)); + e->name = to_atom(c, strlen(c)); + c += e->name->str_len + 1; + memcpy(e->sha1, c, sizeof(e->sha1)); + c += 20; + } + free(buf); +} + +static int tecmp (const void *_a, const void *_b) +{ + struct tree_entry *a = *((struct tree_entry**)_a); + struct tree_entry *b = *((struct tree_entry**)_b); + return base_name_compare( + a->name->str_dat, a->name->str_len, a->mode, + b->name->str_dat, b->name->str_len, b->mode); +} + +static void store_tree(struct tree_entry *root) +{ + struct tree_content *t = root->tree; + unsigned int i; + size_t maxlen; + char *buf, *c; + + if (memcmp(root->sha1, null_sha1, 20)) + return; + + maxlen = 0; + for (i = 0; i < t->entry_count; i++) { + maxlen += t->entries[i]->name->str_len + 34; + if (t->entries[i]->tree) + store_tree(t->entries[i]); + } + + qsort(t->entries, t->entry_count, sizeof(t->entries[0]), tecmp); + buf = c = xmalloc(maxlen); + for (i = 0; i < t->entry_count; i++) { + struct tree_entry *e = t->entries[i]; + c += sprintf(c, "%o", e->mode); + *c++ = ' '; + strcpy(c, e->name->str_dat); + c += e->name->str_len + 1; + memcpy(c, e->sha1, 20); + c += 20; + } + store_object(OBJ_TREE, buf, c - buf, NULL, root->sha1); + free(buf); +} + +static int tree_content_set( + struct tree_entry *root, + const char *p, + const unsigned char *sha1, + const unsigned int mode) +{ + struct tree_content *t = root->tree; + const char *slash1; + unsigned int i, n; + struct tree_entry *e; + + slash1 = strchr(p, '/'); + if (slash1) + n = slash1 - p; + else + n = strlen(p); + + for (i = 0; i < t->entry_count; i++) { + e = t->entries[i]; + if (e->name->str_len == n && !strncmp(p, e->name->str_dat, n)) { + if (!slash1) { + if (e->mode == mode && !memcmp(e->sha1, sha1, 20)) + return 0; + e->mode = mode; + memcpy(e->sha1, sha1, 
20); + if (e->tree) { + release_tree_content(e->tree); + e->tree = NULL; + } + memcpy(root->sha1, null_sha1, 20); + return 1; + } + if (!S_ISDIR(e->mode)) { + e->tree = new_tree_content(8); + e->mode = 040000; + } + if (!e->tree) + load_tree(e); + if (tree_content_set(e, slash1 + 1, sha1, mode)) { + memcpy(root->sha1, null_sha1, 20); + return 1; + } + return 0; + } + } + + if (t->entry_count == t->entry_capacity) + root->tree = t = grow_tree_content(t, 8); + e = new_tree_entry(); + e->name = to_atom(p, n); + t->entries[t->entry_count++] = e; + if (slash1) { + e->tree = new_tree_content(8); + e->mode = 040000; + tree_content_set(e, slash1 + 1, sha1, mode); + } else { + e->tree = NULL; + e->mode = mode; + memcpy(e->sha1, sha1, 20); + } + memcpy(root->sha1, null_sha1, 20); + return 1; +} + +static int tree_content_remove(struct tree_entry *root, const char *p) +{ + struct tree_content *t = root->tree; + const char *slash1; + unsigned int i, n; + struct tree_entry *e; + + slash1 = strchr(p, '/'); + if (slash1) + n = slash1 - p; + else + n = strlen(p); + + for (i = 0; i < t->entry_count; i++) { + e = t->entries[i]; + if (e->name->str_len == n && !strncmp(p, e->name->str_dat, n)) { + if (!slash1 || !S_ISDIR(e->mode)) + goto del_entry; + if (!e->tree) + load_tree(e); + if (tree_content_remove(e, slash1 + 1)) { + if (!e->tree->entry_count) + goto del_entry; + memcpy(root->sha1, null_sha1, 20); + return 1; + } + return 0; + } + } + return 0; + +del_entry: + for (i++; i < t->entry_count; i++) + t->entries[i-1] = t->entries[i]; + t->entry_count--; + release_tree_entry(e); + memcpy(root->sha1, null_sha1, 20); return 1; } @@ -298,13 +790,9 @@ static void init_pack_header() unsigned long zero = 0; version = htonl(version); - - if (ywrite(pack_fd, (char*)magic, 4) != 4) - die("Can't write pack magic: %s", strerror(errno)); - if (ywrite(pack_fd, &version, 4) != 4) - die("Can't write pack version: %s", strerror(errno)); - if (ywrite(pack_fd, &zero, 4) != 4) - die("Can't write 0 
object count: %s", strerror(errno)); + ywrite(pack_fd, (char*)magic, 4); + ywrite(pack_fd, &version, 4); + ywrite(pack_fd, &zero, 4); pack_offset = 4 * 3; } @@ -320,14 +808,12 @@ static void fixup_header_footer() die("Failed seeking to start: %s", strerror(errno)); SHA1_Init(&c); - if (yread(pack_fd, hdr, 8) != 8) - die("Failed reading header: %s", strerror(errno)); + yread(pack_fd, hdr, 8); SHA1_Update(&c, hdr, 8); cnt = htonl(object_count); SHA1_Update(&c, &cnt, 4); - if (ywrite(pack_fd, &cnt, 4) != 4) - die("Failed writing object count: %s", strerror(errno)); + ywrite(pack_fd, &cnt, 4); buf = xmalloc(128 * 1024); for (;;) { @@ -339,8 +825,7 @@ static void fixup_header_footer() free(buf); SHA1_Final(pack_sha1, &c); - if (ywrite(pack_fd, pack_sha1, sizeof(pack_sha1)) != sizeof(pack_sha1)) - die("Failed writing pack checksum: %s", strerror(errno)); + ywrite(pack_fd, pack_sha1, sizeof(pack_sha1)); } static int oecmp (const void *_a, const void *_b) @@ -355,14 +840,14 @@ static void write_index(const char *idx_name) struct sha1file *f; struct object_entry **idx, **c, **last; struct object_entry *e; - struct object_entry_block *o; + struct object_entry_pool *o; unsigned int array[256]; int i; /* Build the sorted table of object IDs. 
*/ idx = xmalloc(object_count * sizeof(struct object_entry*)); c = idx; - for (o = blocks; o; o = o->next_block) + for (o = blocks; o; o = o->next_pool) for (e = o->entries; e != o->next_free; e++) *c++ = e; last = idx + object_count; @@ -393,118 +878,175 @@ static void write_index(const char *idx_name) free(idx); } -static void new_blob() +static void dump_branches() +{ + static const char *msg = "fast-import"; + unsigned int i; + struct branch *b; + struct ref_lock *lock; + + for (i = 0; i < branch_table_sz; i++) { + for (b = branch_table[i]; b; b = b->table_next_branch) { + lock = lock_any_ref_for_update(b->name, NULL, 0); + if (!lock || write_ref_sha1(lock, b->sha1, msg) < 0) + die("Can't write %s", b->name); + } + } +} + +static void cmd_new_blob() { unsigned long datlen; + unsigned char sha1[20]; void *dat; - if (yread(0, &datlen, 4) != 4) - die("Can't obtain blob length"); - + yread(0, &datlen, 4); dat = xmalloc(datlen); - if (yread(0, dat, datlen) != datlen) - die("Con't obtain %lu bytes of blob data", datlen); - - if (!store_object(OBJ_BLOB, dat, datlen, &last_blob, 0)) + yread(0, dat, datlen); + if (store_object(OBJ_BLOB, dat, datlen, &last_blob, sha1)) free(dat); } -static struct branch* lookup_branch(const char *name) +static void unload_one_branch() { - struct branch *b; - for (b = branches; b; b = b->next_branch) { - if (!strcmp(name, b->name)) - return b; + while (cur_active_branches >= max_active_branches) { + unsigned long min_commit = ULONG_MAX; + struct branch *e, *l = NULL, *p = NULL; + + for (e = active_branches; e; e = e->active_next_branch) { + if (e->last_commit < min_commit) { + p = l; + min_commit = e->last_commit; + } + l = e; + } + + if (p) { + e = p->active_next_branch; + p->active_next_branch = e->active_next_branch; + } else { + e = active_branches; + active_branches = e->active_next_branch; + } + e->active_next_branch = NULL; + if (e->branch_tree.tree) { + release_tree_content(e->branch_tree.tree); + e->branch_tree.tree = NULL; + } + 
cur_active_branches--; } - die("No branch named '%s' has been declared", name); } -static struct tree* deep_copy_tree (struct tree *t) +static void load_branch(struct branch *b) { - struct tree *r = xmalloc(sizeof(struct tree)); - unsigned long i; - - if (t->last_tree.data) { - r->last_tree.data = xmalloc(t->last_tree.len); - r->last_tree.len = t->last_tree.len; - r->last_tree.depth = t->last_tree.depth; - memcpy(r->last_tree.data, t->last_tree.data, t->last_tree.len); - memcpy(r->last_tree.sha1, t->last_tree.sha1, sizeof(t->last_tree.sha1)); - } - - r->entry_count = t->entry_count; - r->entries = xmalloc(t->entry_count * sizeof(struct tree_entry*)); - for (i = 0; i < t->entry_count; i++) { - struct tree_entry *a = t->entries[i]; - struct tree_entry *b; - - b = xmalloc(sizeof(struct tree_entry) + strlen(a->name) + 1); - b->tree = a->tree ? deep_copy_tree(a->tree) : 0; - b->mode = a->mode; - memcpy(b->sha1, a->sha1, sizeof(a->sha1)); - strcpy(b->name, a->name); - r->entries[i] = b; - } - - return r; + load_tree(&b->branch_tree); + b->active_next_branch = active_branches; + active_branches = b; + cur_active_branches++; } -static void store_tree (struct tree_entry *e) +static void file_change_m(struct branch *b) { - struct tree *t = e->tree; - unsigned long maxlen, i; - char *buf, *c; + const char *path = read_path(); + char hexsha1[41]; + unsigned char sha1[20]; - if (memcmp(null_sha1, e->sha1, sizeof(e->sha1))) - return; + yread(0, hexsha1, 40); + hexsha1[40] = 0; - maxlen = t->entry_count * 32; - for (i = 0; i < t->entry_count; i++) - maxlen += strlen(t->entries[i]->name); + if (get_sha1_hex(hexsha1, sha1)) + die("Invalid sha1 %s for %s", hexsha1, path); - buf = c = xmalloc(maxlen); - for (i = 0; i < t->entry_count; i++) { - struct tree_entry *e = t->entries[i]; - c += sprintf(c, "%o %s", e->mode, e->name) + 1; - if (e->tree) - store_tree(e); - memcpy(c, e->sha1, sizeof(e->sha1)); - c += sizeof(e->sha1); + tree_content_set(&b->branch_tree, path, sha1, 0100644); +} 
+ +static void file_change_d(struct branch *b) +{ + tree_content_remove(&b->branch_tree, read_path()); +} + +static void cmd_new_commit() +{ + static const unsigned int max_hdr_len = 94; + const char *name = read_path(); + struct branch *b = lookup_branch(name); + unsigned int acmsglen; + char *body, *c; + + if (!b) + die("Branch not declared: %s", name); + if (!b->branch_tree.tree) { + unload_one_branch(); + load_branch(b); } - if (!store_object(OBJ_TREE, buf, c - buf, &t->last_tree, e->sha1)) - free(buf); -} + /* author_committer_msg */ + yread(0, &acmsglen, 4); + body = xmalloc(acmsglen + max_hdr_len); + c = body + max_hdr_len; + yread(0, c, acmsglen); -static void new_branch() -{ - struct branch *nb = xcalloc(1, sizeof(struct branch)); - const char *source_name; - - nb->name = strdup(read_required_string()); - source_name = read_string(); - if (source_name) { - struct branch *sb = lookup_branch(source_name); - nb->tree.tree = deep_copy_tree(sb->tree.tree); - memcpy(nb->tree.sha1, sb->tree.sha1, sizeof(sb->tree.sha1)); - memcpy(nb->sha1, sb->sha1, sizeof(sb->sha1)); - } else { - nb->tree.tree = xcalloc(1, sizeof(struct tree)); - nb->tree.tree->entries = xmalloc(8*sizeof(struct tree_entry*)); + /* file_change* */ + for (;;) { + unsigned char cmd; + yread(0, &cmd, 1); + if (cmd == '0') + break; + else if (cmd == 'M') + file_change_m(b); + else if (cmd == 'D') + file_change_d(b); + else + die("Unsupported file_change: %c", cmd); } - nb->next_branch = branches; - branches = nb; - branch_count++; + + if (memcmp(b->sha1, null_sha1, 20)) { + sprintf(c - 48, "parent %s", sha1_to_hex(b->sha1)); + *(c - 1) = '\n'; + c -= 48; + } + store_tree(&b->branch_tree); + sprintf(c - 46, "tree %s", sha1_to_hex(b->branch_tree.sha1)); + *(c - 1) = '\n'; + c -= 46; + + store_object(OBJ_COMMIT, + c, (body + max_hdr_len + acmsglen) - c, + NULL, b->sha1); + free(body); + b->last_commit = object_count_by_type[OBJ_COMMIT]; } -static void set_branch() +static void cmd_new_branch() { - 
current_branch = lookup_branch(read_required_string()); -} + struct branch *b = new_branch(read_path()); + const char *base = read_path(); + struct branch *s = lookup_branch(base); -static void commit() -{ - store_tree(¤t_branch->tree); + if (!strcmp(b->name, base)) + die("Can't create a branch from itself: %s", base); + else if (s) { + memcpy(b->sha1, s->sha1, 20); + memcpy(b->branch_tree.sha1, s->branch_tree.sha1, 20); + } + else if (!get_sha1(base, b->sha1)) { + if (!memcmp(b->sha1, null_sha1, 20)) + memcpy(b->branch_tree.sha1, null_sha1, 20); + else { + unsigned long size; + char *buf; + + buf = read_object_with_reference(b->sha1, + type_names[OBJ_COMMIT], &size, b->sha1); + if (!buf || size < 46) + die("Not a valid commit: %s", base); + if (memcmp("tree ", buf, 5) + || get_sha1_hex(buf + 5, b->branch_tree.sha1)) + die("The commit %s is corrupt", sha1_to_hex(b->sha1)); + free(buf); + } + } else + die("Not a SHA1 or branch: %s", base); } int main(int argc, const char **argv) @@ -515,6 +1057,9 @@ int main(int argc, const char **argv) char *idx_name; struct stat sb; + setup_ident(); + git_config(git_default_config); + pack_name = xmalloc(strlen(base_name) + 6); sprintf(pack_name, "%s.pack", base_name); idx_name = xmalloc(strlen(base_name) + 5); @@ -525,17 +1070,21 @@ int main(int argc, const char **argv) die("Can't create %s: %s", pack_name, strerror(errno)); alloc_objects(est_obj_cnt); + + atom_table = xcalloc(atom_table_sz, sizeof(struct atom_str*)); + branch_table = xcalloc(branch_table_sz, sizeof(struct branch*)); + avail_tree_table = xcalloc(avail_tree_table_sz, sizeof(struct avail_tree_content*)); + init_pack_header(); for (;;) { unsigned long cmd; - if (yread(0, &cmd, 4) != 4) + if (optional_read(0, &cmd, 4)) break; - switch (cmd) { - case 'blob': new_blob(); break; - case 'newb': new_branch(); break; - case 'setb': set_branch(); break; - case 'comt': commit(); break; + switch (ntohl(cmd)) { + case 'blob': cmd_new_blob(); break; + case 'comt': 
cmd_new_commit(); break; + case 'brch': cmd_new_branch(); break; default: die("Invalid command %lu", cmd); } @@ -543,6 +1092,7 @@ int main(int argc, const char **argv) fixup_header_footer(); close(pack_fd); write_index(idx_name); + dump_branches(); fprintf(stderr, "%s statistics:\n", argv[0]); fprintf(stderr, "---------------------------------------------------\n"); @@ -553,6 +1103,8 @@ int main(int argc, const char **argv) fprintf(stderr, " commits: %10lu (%10lu duplicates)\n", object_count_by_type[OBJ_COMMIT], duplicate_count_by_type[OBJ_COMMIT]); fprintf(stderr, " tags : %10lu (%10lu duplicates)\n", object_count_by_type[OBJ_TAG], duplicate_count_by_type[OBJ_TAG]); fprintf(stderr, "Total branches: %10lu\n", branch_count); + fprintf(stderr, "Total atoms: %10u\n", atom_cnt); + fprintf(stderr, "Memory pools: %10lu MiB\n", total_allocd/(1024*1024)); fprintf(stderr, "---------------------------------------------------\n"); stat(pack_name, &sb); From 7111feede9c5905199ba48645fadc369faca5711 Mon Sep 17 00:00:00 2001 From: "Shawn O. Pearce" Date: Mon, 14 Aug 2006 02:50:18 -0400 Subject: [PATCH 08/81] Implement blob ID validation in fast-import. When accepting revision SHA1 IDs from the frontend verify the SHA1 actually refers to a blob and is known to exist. It's an error to use a SHA1 in a tree if the blob doesn't exist as this would cause git-fsck-objects to report a missing blob should the pack get closed without the blob being appended into it or a subsequent pack. So right now we'll just ask that the frontend "pre-declare" any blobs it wants to use in a tree before it can use them. Signed-off-by: Shawn O. 
Pearce --- fast-import.c | 28 ++++++++++++++++++++++++---- 1 file changed, 24 insertions(+), 4 deletions(-) diff --git a/fast-import.c b/fast-import.c index 4605b7469b..95b84f57e5 100644 --- a/fast-import.c +++ b/fast-import.c @@ -70,6 +70,7 @@ Format of STDIN stream: struct object_entry { struct object_entry *next; + enum object_type type; unsigned long offset; unsigned char sha1[20]; }; @@ -528,6 +529,7 @@ static int store_object( duplicate_count_by_type[type]++; return 1; } + e->type = type; e->offset = pack_offset; object_count++; object_count_by_type[type]++; @@ -713,7 +715,7 @@ static int tree_content_set( } if (!S_ISDIR(e->mode)) { e->tree = new_tree_content(8); - e->mode = 040000; + e->mode = S_IFDIR; } if (!e->tree) load_tree(e); @@ -732,7 +734,7 @@ static int tree_content_set( t->entries[t->entry_count++] = e; if (slash1) { e->tree = new_tree_content(8); - e->mode = 040000; + e->mode = S_IFDIR; tree_content_set(e, slash1 + 1, sha1, mode); } else { e->tree = NULL; @@ -948,16 +950,28 @@ static void load_branch(struct branch *b) static void file_change_m(struct branch *b) { const char *path = read_path(); + struct object_entry *oe; char hexsha1[41]; unsigned char sha1[20]; + char type[20]; yread(0, hexsha1, 40); hexsha1[40] = 0; if (get_sha1_hex(hexsha1, sha1)) die("Invalid sha1 %s for %s", hexsha1, path); + oe = find_object(sha1); + if (oe) { + if (oe->type != OBJ_BLOB) + die("%s is a %s not a blob (for %s)", hexsha1, type_names[oe->type], path); + } else { + if (sha1_object_info(sha1, type, NULL)) + die("No blob %s for %s", hexsha1, path); + if (strcmp(blob_type, type)) + die("%s is a %s not a blob (for %s)", hexsha1, type, path); + } - tree_content_set(&b->branch_tree, path, sha1, 0100644); + tree_content_set(&b->branch_tree, path, sha1, S_IFREG | 0644); } static void file_change_d(struct branch *b) @@ -986,6 +1000,10 @@ static void cmd_new_commit() c = body + max_hdr_len; yread(0, c, acmsglen); + /* oddly enough this is all that fsck-objects cares about 
*/ + if (memcmp(c, "author ", 7)) + die("Invalid commit format on branch %s", name); + /* file_change* */ for (;;) { unsigned char cmd; @@ -1104,7 +1122,9 @@ int main(int argc, const char **argv) fprintf(stderr, " tags : %10lu (%10lu duplicates)\n", object_count_by_type[OBJ_TAG], duplicate_count_by_type[OBJ_TAG]); fprintf(stderr, "Total branches: %10lu\n", branch_count); fprintf(stderr, "Total atoms: %10u\n", atom_cnt); - fprintf(stderr, "Memory pools: %10lu MiB\n", total_allocd/(1024*1024)); + fprintf(stderr, "Memory total: %10lu KiB\n", (total_allocd + alloc_count*sizeof(struct object_entry))/1024); + fprintf(stderr, " pools: %10lu KiB\n", total_allocd/1024); + fprintf(stderr, " objects: %10lu KiB\n", (alloc_count*sizeof(struct object_entry))/1024); fprintf(stderr, "---------------------------------------------------\n"); stat(pack_name, &sb); From c44cdc7eef212ec09901eb2e0996476e0468ed88 Mon Sep 17 00:00:00 2001 From: "Shawn O. Pearce" Date: Mon, 14 Aug 2006 20:16:28 -0400 Subject: [PATCH 09/81] Converted fast-import to a text based protocol. Frontend clients can now send a text stream to fast-import rather than a binary stream. This should facilitate developing frontend software as the data stream is easier to view, manipulate and debug by hand and Mark-I eyeball. Signed-off-by: Shawn O. Pearce --- fast-import.c | 470 ++++++++++++++++++++++++++++++++++---------------- 1 file changed, 318 insertions(+), 152 deletions(-) diff --git a/fast-import.c b/fast-import.c index 95b84f57e5..2953e80cde 100644 --- a/fast-import.c +++ b/fast-import.c @@ -4,57 +4,93 @@ Format of STDIN stream: stream ::= cmd*; cmd ::= new_blob - | new_commit | new_branch + | new_commit | new_tag ; - new_blob ::= 'blob' blob_data; + new_blob ::= 'blob' lf + mark? + file_content; + file_content ::= data; - new_commit ::= 'comt' ref_name author_committer_msg + new_branch ::= 'branch' sp ref_str lf + ('from' sp (ref_str | hexsha1 | sha1exp_str | idnum) lf)? 
+ lf; + + new_commit ::= 'commit' sp ref_str lf + mark? + ('author' sp name '<' email '>' ts tz lf)? + 'committer' sp name '<' email '>' ts tz lf + commit_msg file_change* - '0'; + lf; + commit_msg ::= data; - new_branch ::= 'brch' dst_ref_name src_ref_name; - dst_ref_name ::= ref_name; - src_ref_name ::= ref_name | sha1_exp; - - new_tag ::= 'tagg' ref_name tag_name tagger_msg; - - file_change ::= 'M' path_name hexsha1 - | 'D' path_name + file_change ::= 'M' sp mode sp (hexsha1 | idnum) sp path_str lf + | 'D' sp path_str lf ; + mode ::= '644' | '755'; - author_committer_msg ::= len32 - 'author' sp name '<' email '>' ts tz lf - 'committer' sp name '<' email '>' ts tz lf - lf - binary_data; + new_tag ::= 'tag' sp tag_str lf + 'from' sp (ref_str | hexsha1 | sha1exp_str | idnum) lf + 'tagger' sp name '<' email '>' ts tz lf + tag_msg; + tag_msg ::= data; - tagger_msg ::= len32 - 'tagger' sp name '<' email '>' ts tz lf - lf - binary_data; + # note: the first idnum in a stream should be 1 and subsequent + # idnums should not have gaps between values as this will cause + # the stream parser to reserve space for the gapped values. An + # idnum can be updated in the future to a new object by issuing + # a new mark directive with the old idnum. + # + mark ::= 'mark' sp idnum lf; - blob_data ::= len32 binary_data; # max len is 2^32-1 - path_name ::= len32 path; # max len is PATH_MAX-1 - ref_name ::= len32 ref; # max len is PATH_MAX-1 - tag_name ::= len32 tag; # max len is PATH_MAX-1 - sha1_exp ::= len32 sha1exp; # max len is PATH_MAX-1 + # note: declen indicates the length of binary_data in bytes. + # declen does not include the lf preceeding or trailing the + # binary data. + # + data ::= 'data' sp declen lf + binary_data + lf; - len32 ::= # unsigned 32 bit value, native format; + # note: quoted strings are C-style quoting supporting \c for + # common escapes of 'c' (e..g \n, \t, \\, \") or \nnn where nnn + # is the signed byte value in octal. 
Note that the only + # characters which must actually be escaped to protect the + # stream formatting is: \, " and LF. Otherwise these values + # are UTF8. + # + ref_str ::= ref | '"' quoted(ref) '"' ; + sha1exp_str ::= sha1exp | '"' quoted(sha1exp) '"' ; + tag_str ::= tag | '"' quoted(tag) '"' ; + path_str ::= path | '"' quoted(path) '"' ; + + declen ::= # unsigned 32 bit value, ascii base10 notation; binary_data ::= # file content, not interpreted; + sp ::= # ASCII space character; lf ::= # ASCII newline (LF) character; - path ::= # GIT style file path, e.g. "a/b/c"; - ref ::= # GIT ref name, e.g. "refs/heads/MOZ_GECKO_EXPERIMENT"; - tag ::= # GIT tag name, e.g. "FIREFOX_1_5"; + + # note: a colon (':') must precede the numerical value assigned to + # an idnum. This is to distinguish it from a ref or tag name as + # GIT does not permit ':' in ref or tag strings. + # + idnum ::= ':' declen; + path ::= # GIT style file path, e.g. "a/b/c"; + ref ::= # GIT ref name, e.g. "refs/heads/MOZ_GECKO_EXPERIMENT"; + tag ::= # GIT tag name, e.g. "FIREFOX_1_5"; sha1exp ::= # Any valid GIT SHA1 expression; hexsha1 ::= # SHA1 in hexadecimal format; - name ::= # valid GIT author/committer name; + + # note: name and email are UTF8 strings, however name must not + # contain '<' or lf and email must not contain any of the + # following: '<', '>', lf. 
+ # + name ::= # valid GIT author/committer name; email ::= # valid GIT author/committer email; - ts ::= # time since the epoch in seconds, ascii decimal; - tz ::= # GIT style timezone; + ts ::= # time since the epoch in seconds, ascii base10 notation; + tz ::= # GIT style timezone; */ #include "builtin.h" @@ -66,6 +102,8 @@ Format of STDIN stream: #include "pack.h" #include "refs.h" #include "csum-file.h" +#include "strbuf.h" +#include "quote.h" struct object_entry { @@ -153,7 +191,7 @@ static size_t mem_pool_alloc = 2*1024*1024 - sizeof(struct mem_pool); static size_t total_allocd; static struct mem_pool *mem_pool; -/* atom management */ +/* Atom management */ static unsigned int atom_table_sz = 4451; static unsigned int atom_cnt; static struct atom_str **atom_table; @@ -184,6 +222,10 @@ static unsigned int branch_table_sz = 1039; static struct branch **branch_table; static struct branch *active_branches; +/* Input stream parsing */ +static struct strbuf command_buf; +static unsigned long command_mark; + static void alloc_objects(int cnt) { @@ -330,6 +372,8 @@ static struct branch* new_branch(const char *name) if (b) die("Invalid attempt to create duplicate branch: %s", name); + if (check_ref_format(name)) + die("Branch name doesn't conform to GIT standards: %s", name); b = pool_calloc(1, sizeof(struct branch)); b->name = pool_strdup(name); @@ -433,22 +477,6 @@ static void yread(int fd, void *buffer, size_t length) } } -static int optional_read(int fd, void *buffer, size_t length) -{ - ssize_t ret = 0; - while (ret < length) { - ssize_t size = xread(fd, (char *) buffer + ret, length - ret); - if (!size && !ret) - return 1; - if (!size) - die("Read from descriptor %i: end of stream", fd); - if (size < 0) - die("Read from descriptor %i: %s", fd, strerror(errno)); - ret += size; - } - return 0; -} - static void ywrite(int fd, void *buffer, size_t length) { ssize_t ret = 0; @@ -462,24 +490,9 @@ static void ywrite(int fd, void *buffer, size_t length) } } -static const 
char* read_path() -{ - static char sn[PATH_MAX]; - unsigned long slen; - - yread(0, &slen, 4); - if (!slen) - die("Expected string command parameter, didn't find one"); - if (slen > (PATH_MAX - 1)) - die("Can't handle excessive string length %lu", slen); - yread(0, sn, slen); - sn[slen] = 0; - return sn; -} - -static unsigned long encode_header( +static size_t encode_header( enum object_type type, - unsigned long size, + size_t size, unsigned char *hdr) { int n = 1; @@ -503,7 +516,7 @@ static unsigned long encode_header( static int store_object( enum object_type type, void *dat, - unsigned long datlen, + size_t datlen, struct last_object *last, unsigned char *sha1out) { @@ -896,15 +909,57 @@ static void dump_branches() } } +static void read_next_command() +{ + read_line(&command_buf, stdin, '\n'); +} + +static void cmd_mark() +{ + if (!strncmp("mark :", command_buf.buf, 6)) { + command_mark = strtoul(command_buf.buf + 6, NULL, 10); + read_next_command(); + } + else + command_mark = 0; +} + +static void* cmd_data (size_t *size) +{ + size_t n = 0; + void *buffer; + size_t length; + + if (strncmp("data ", command_buf.buf, 5)) + die("Expected 'data n' command, found: %s", command_buf.buf); + + length = strtoul(command_buf.buf + 5, NULL, 10); + buffer = xmalloc(length); + + while (n < length) { + size_t s = fread((char*)buffer + n, 1, length - n, stdin); + if (!s && feof(stdin)) + die("EOF in data (%lu bytes remaining)", length - n); + n += s; + } + + if (fgetc(stdin) != '\n') + die("An lf did not trail the binary data as expected."); + + *size = length; + return buffer; +} + static void cmd_new_blob() { - unsigned long datlen; - unsigned char sha1[20]; + size_t datlen; void *dat; + unsigned char sha1[20]; + + read_next_command(); + cmd_mark(); + dat = cmd_data(&datlen); - yread(0, &datlen, 4); - dat = xmalloc(datlen); - yread(0, dat, datlen); if (store_object(OBJ_BLOB, dat, datlen, &last_blob, sha1)) free(dat); } @@ -949,122 +1004,231 @@ static void load_branch(struct 
branch *b) static void file_change_m(struct branch *b) { - const char *path = read_path(); + const char *p = command_buf.buf + 2; + char *p_uq; + const char *endp; struct object_entry *oe; - char hexsha1[41]; unsigned char sha1[20]; + unsigned int mode; char type[20]; - yread(0, hexsha1, 40); - hexsha1[40] = 0; + p = get_mode(p, &mode); + if (!p) + die("Corrupt mode: %s", command_buf.buf); + switch (mode) { + case S_IFREG | 0644: + case S_IFREG | 0755: + case 0644: + case 0755: + /* ok */ + break; + default: + die("Corrupt mode: %s", command_buf.buf); + } + + if (get_sha1_hex(p, sha1)) + die("Invalid SHA1: %s", command_buf.buf); + p += 40; + if (*p++ != ' ') + die("Missing space after SHA1: %s", command_buf.buf); + + p_uq = unquote_c_style(p, &endp); + if (p_uq) { + if (*endp) + die("Garbage after path in: %s", command_buf.buf); + p = p_uq; + } - if (get_sha1_hex(hexsha1, sha1)) - die("Invalid sha1 %s for %s", hexsha1, path); oe = find_object(sha1); if (oe) { if (oe->type != OBJ_BLOB) - die("%s is a %s not a blob (for %s)", hexsha1, type_names[oe->type], path); + die("Not a blob (actually a %s): %s", + command_buf.buf, type_names[oe->type]); } else { if (sha1_object_info(sha1, type, NULL)) - die("No blob %s for %s", hexsha1, path); + die("Blob not found: %s", command_buf.buf); if (strcmp(blob_type, type)) - die("%s is a %s not a blob (for %s)", hexsha1, type, path); + die("Not a blob (actually a %s): %s", + command_buf.buf, type); } - tree_content_set(&b->branch_tree, path, sha1, S_IFREG | 0644); + tree_content_set(&b->branch_tree, p, sha1, S_IFREG | mode); + + if (p_uq) + free(p_uq); } static void file_change_d(struct branch *b) { - tree_content_remove(&b->branch_tree, read_path()); + const char *p = command_buf.buf + 2; + char *p_uq; + const char *endp; + + p_uq = unquote_c_style(p, &endp); + if (p_uq) { + if (*endp) + die("Garbage after path in: %s", command_buf.buf); + p = p_uq; + } + tree_content_remove(&b->branch_tree, p); + if (p_uq) + free(p_uq); } static 
void cmd_new_commit() { - static const unsigned int max_hdr_len = 94; - const char *name = read_path(); - struct branch *b = lookup_branch(name); - unsigned int acmsglen; - char *body, *c; + struct branch *b; + void *msg; + size_t msglen; + char *str_uq; + const char *endp; + char *sp; + char *author = NULL; + char *committer = NULL; + char *body; + /* Obtain the branch name from the rest of our command */ + sp = strchr(command_buf.buf, ' ') + 1; + str_uq = unquote_c_style(sp, &endp); + if (str_uq) { + if (*endp) + die("Garbage after ref in: %s", command_buf.buf); + sp = str_uq; + } + b = lookup_branch(sp); if (!b) - die("Branch not declared: %s", name); + die("Branch not declared: %s", sp); + if (str_uq) + free(str_uq); + + read_next_command(); + cmd_mark(); + if (!strncmp("author ", command_buf.buf, 7)) { + author = strdup(command_buf.buf); + read_next_command(); + } + if (!strncmp("committer ", command_buf.buf, 10)) { + committer = strdup(command_buf.buf); + read_next_command(); + } + if (!committer) + die("Expected committer but didn't get one"); + msg = cmd_data(&msglen); + + /* ensure the branch is active/loaded */ if (!b->branch_tree.tree) { unload_one_branch(); load_branch(b); } - /* author_committer_msg */ - yread(0, &acmsglen, 4); - body = xmalloc(acmsglen + max_hdr_len); - c = body + max_hdr_len; - yread(0, c, acmsglen); - - /* oddly enough this is all that fsck-objects cares about */ - if (memcmp(c, "author ", 7)) - die("Invalid commit format on branch %s", name); - /* file_change* */ for (;;) { - unsigned char cmd; - yread(0, &cmd, 1); - if (cmd == '0') + read_next_command(); + if (1 == command_buf.len) break; - else if (cmd == 'M') + else if (!strncmp("M ", command_buf.buf, 2)) file_change_m(b); - else if (cmd == 'D') + else if (!strncmp("D ", command_buf.buf, 2)) file_change_d(b); else - die("Unsupported file_change: %c", cmd); + die("Unsupported file_change: %s", command_buf.buf); } - if (memcmp(b->sha1, null_sha1, 20)) { - sprintf(c - 48, "parent 
%s", sha1_to_hex(b->sha1)); - *(c - 1) = '\n'; - c -= 48; - } + /* build the tree and the commit */ store_tree(&b->branch_tree); - sprintf(c - 46, "tree %s", sha1_to_hex(b->branch_tree.sha1)); - *(c - 1) = '\n'; - c -= 46; + body = xmalloc(97 + msglen + + (author + ? strlen(author) + strlen(committer) + : 2 * strlen(committer))); + sp = body; + sp += sprintf(sp, "tree %s\n", sha1_to_hex(b->branch_tree.sha1)); + if (memcmp(b->sha1, null_sha1, 20)) + sp += sprintf(sp, "parent %s\n", sha1_to_hex(b->sha1)); + if (author) + sp += sprintf(sp, "%s\n", author); + else + sp += sprintf(sp, "author %s\n", committer + 10); + sp += sprintf(sp, "%s\n\n", committer); + memcpy(sp, msg, msglen); + sp += msglen; + if (author) + free(author); + free(committer); + free(msg); - store_object(OBJ_COMMIT, - c, (body + max_hdr_len + acmsglen) - c, - NULL, b->sha1); + store_object(OBJ_COMMIT, body, sp - body, NULL, b->sha1); free(body); b->last_commit = object_count_by_type[OBJ_COMMIT]; } static void cmd_new_branch() { - struct branch *b = new_branch(read_path()); - const char *base = read_path(); - struct branch *s = lookup_branch(base); + struct branch *b; + char *str_uq; + const char *endp; + char *sp; - if (!strcmp(b->name, base)) - die("Can't create a branch from itself: %s", base); - else if (s) { - memcpy(b->sha1, s->sha1, 20); - memcpy(b->branch_tree.sha1, s->branch_tree.sha1, 20); + /* Obtain the new branch name from the rest of our command */ + sp = strchr(command_buf.buf, ' ') + 1; + str_uq = unquote_c_style(sp, &endp); + if (str_uq) { + if (*endp) + die("Garbage after ref in: %s", command_buf.buf); + sp = str_uq; } - else if (!get_sha1(base, b->sha1)) { - if (!memcmp(b->sha1, null_sha1, 20)) - memcpy(b->branch_tree.sha1, null_sha1, 20); - else { - unsigned long size; - char *buf; + b = new_branch(sp); + if (str_uq) + free(str_uq); + read_next_command(); - buf = read_object_with_reference(b->sha1, - type_names[OBJ_COMMIT], &size, b->sha1); - if (!buf || size < 46) - die("Not a 
valid commit: %s", base); - if (memcmp("tree ", buf, 5) - || get_sha1_hex(buf + 5, b->branch_tree.sha1)) - die("The commit %s is corrupt", sha1_to_hex(b->sha1)); - free(buf); + /* from ... */ + if (!strncmp("from ", command_buf.buf, 5)) { + const char *from; + struct branch *s; + + from = strchr(command_buf.buf, ' ') + 1; + str_uq = unquote_c_style(from, &endp); + if (str_uq) { + if (*endp) + die("Garbage after string in: %s", command_buf.buf); + from = str_uq; } - } else - die("Not a SHA1 or branch: %s", base); + + s = lookup_branch(from); + if (b == s) + die("Can't create a branch from itself: %s", b->name); + else if (s) { + memcpy(b->sha1, s->sha1, 20); + memcpy(b->branch_tree.sha1, s->branch_tree.sha1, 20); + } else if (!get_sha1(from, b->sha1)) { + if (!memcmp(b->sha1, null_sha1, 20)) + memcpy(b->branch_tree.sha1, null_sha1, 20); + else { + unsigned long size; + char *buf; + + buf = read_object_with_reference(b->sha1, + type_names[OBJ_COMMIT], &size, b->sha1); + if (!buf || size < 46) + die("Not a valid commit: %s", from); + if (memcmp("tree ", buf, 5) + || get_sha1_hex(buf + 5, b->branch_tree.sha1)) + die("The commit %s is corrupt", sha1_to_hex(b->sha1)); + free(buf); + } + } else + die("Invalid ref name or SHA1 expression: %s", from); + + if (str_uq) + free(str_uq); + read_next_command(); + } else { + memcpy(b->sha1, null_sha1, 20); + memcpy(b->branch_tree.sha1, null_sha1, 20); + } + + if (command_buf.eof || command_buf.len > 1) + die("An lf did not terminate the branch command as expected."); } int main(int argc, const char **argv) @@ -1087,26 +1251,28 @@ int main(int argc, const char **argv) if (pack_fd < 0) die("Can't create %s: %s", pack_name, strerror(errno)); + init_pack_header(); alloc_objects(est_obj_cnt); + strbuf_init(&command_buf); atom_table = xcalloc(atom_table_sz, sizeof(struct atom_str*)); branch_table = xcalloc(branch_table_sz, sizeof(struct branch*)); avail_tree_table = xcalloc(avail_tree_table_sz, sizeof(struct avail_tree_content*)); - 
init_pack_header(); for (;;) { - unsigned long cmd; - if (optional_read(0, &cmd, 4)) + read_next_command(); + if (command_buf.eof) break; - - switch (ntohl(cmd)) { - case 'blob': cmd_new_blob(); break; - case 'comt': cmd_new_commit(); break; - case 'brch': cmd_new_branch(); break; - default: - die("Invalid command %lu", cmd); - } + else if (!strcmp("blob", command_buf.buf)) + cmd_new_blob(); + else if (!strncmp("branch ", command_buf.buf, 7)) + cmd_new_branch(); + else if (!strncmp("commit ", command_buf.buf, 7)) + cmd_new_commit(); + else + die("Unsupported command: %s", command_buf.buf); } + fixup_header_footer(); close(pack_fd); write_index(idx_name); From c90be46abdbd102ab8e9af0303d33976d552ae58 Mon Sep 17 00:00:00 2001 From: "Shawn O. Pearce" Date: Wed, 16 Aug 2006 01:57:57 -0400 Subject: [PATCH 10/81] Changed fast-import's pack header creation to use pack.h Signed-off-by: Shawn O. Pearce Signed-off-by: Shawn O. Pearce --- fast-import.c | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/fast-import.c b/fast-import.c index 2953e80cde..d5651693ba 100644 --- a/fast-import.c +++ b/fast-import.c @@ -800,15 +800,14 @@ del_entry: static void init_pack_header() { - const char* magic = "PACK"; - unsigned long version = 3; - unsigned long zero = 0; + struct pack_header hdr; - version = htonl(version); - ywrite(pack_fd, (char*)magic, 4); - ywrite(pack_fd, &version, 4); - ywrite(pack_fd, &zero, 4); - pack_offset = 4 * 3; + hdr.hdr_signature = htonl(PACK_SIGNATURE); + hdr.hdr_version = htonl(2); + hdr.hdr_entries = 0; + + ywrite(pack_fd, &hdr, sizeof(hdr)); + pack_offset = sizeof(hdr); } static void fixup_header_footer() From ace4a9d1ae5efd056c5e57cc76aacee3057a73f7 Mon Sep 17 00:00:00 2001 From: "Shawn O. Pearce" Date: Mon, 21 Aug 2006 03:29:13 -0400 Subject: [PATCH 11/81] Allow symlink blobs in trees during fast-import. If a frontend is smart enough to import a symlink then we should let them do so. 
We'll assume that they were smart enough to first generate a blob to hold the link target, as that's how symlinks get represented in GIT. Signed-off-by: Shawn O. Pearce --- fast-import.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fast-import.c b/fast-import.c index d5651693ba..7d1ee1dad9 100644 --- a/fast-import.c +++ b/fast-import.c @@ -1017,6 +1017,7 @@ static void file_change_m(struct branch *b) switch (mode) { case S_IFREG | 0644: case S_IFREG | 0755: + case S_IFLNK: case 0644: case 0755: /* ok */ From afde8dd96dbb81688d7cb22330e4fffcfc7def21 Mon Sep 17 00:00:00 2001 From: "Shawn O. Pearce" Date: Wed, 23 Aug 2006 01:33:47 -0400 Subject: [PATCH 12/81] Fixed segfault in fast-import after growing a tree. Growing a tree caused all subtrees to be deallocated and put back into the free list yet those subtree's contents were still actively in use. Consequently they were doled out again and got stomped on elsewhere. Releasing a tree is now performed in two parts, either releasing only the content array or releasing the content array and recursively releasing the subtree(s). Signed-off-by: Shawn O. 
Pearce --- fast-import.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/fast-import.c b/fast-import.c index 7d1ee1dad9..4c2431f0b0 100644 --- a/fast-import.c +++ b/fast-import.c @@ -420,11 +420,16 @@ static void release_tree_content(struct tree_content *t) { struct avail_tree_content *f = (struct avail_tree_content*)t; unsigned int hc = hc_entries(f->entry_capacity); + f->next_avail = avail_tree_table[hc]; + avail_tree_table[hc] = f; +} + +static void release_tree_content_recursive(struct tree_content *t) +{ unsigned int i; for (i = 0; i < t->entry_count; i++) release_tree_entry(t->entries[i]); - f->next_avail = avail_tree_table[hc]; - avail_tree_table[hc] = f; + release_tree_content(t); } static struct tree_content* grow_tree_content( @@ -459,7 +464,7 @@ static struct tree_entry* new_tree_entry() static void release_tree_entry(struct tree_entry *e) { if (e->tree) - release_tree_content(e->tree); + release_tree_content_recursive(e->tree); *((void**)e) = avail_tree_entry; avail_tree_entry = e; } @@ -720,7 +725,7 @@ static int tree_content_set( e->mode = mode; memcpy(e->sha1, sha1, 20); if (e->tree) { - release_tree_content(e->tree); + release_tree_content_recursive(e->tree); e->tree = NULL; } memcpy(root->sha1, null_sha1, 20); @@ -986,7 +991,7 @@ static void unload_one_branch() } e->active_next_branch = NULL; if (e->branch_tree.tree) { - release_tree_content(e->branch_tree.tree); + release_tree_content_recursive(e->branch_tree.tree); e->branch_tree.tree = NULL; } cur_active_branches--; From d5c57b284e847a56cc1d98b783be95ba94285afe Mon Sep 17 00:00:00 2001 From: "Shawn O. Pearce" Date: Wed, 23 Aug 2006 02:00:31 -0400 Subject: [PATCH 13/81] Converted fast-import to accept standard command line parameters. 
The following command line options are now accepted before the pack name: --objects=n # replaces the object count after the pack name --depth=n # delta chain depth to use (default is 10) --active-branches=n # maximum number of branches to keep in memory Signed-off-by: Shawn O. Pearce --- fast-import.c | 34 ++++++++++++++++++++++++++++------ 1 file changed, 28 insertions(+), 6 deletions(-) diff --git a/fast-import.c b/fast-import.c index 4c2431f0b0..8598493651 100644 --- a/fast-import.c +++ b/fast-import.c @@ -178,7 +178,7 @@ struct branch /* Stats and misc. counters */ -static int max_depth = 10; +static unsigned long max_depth = 10; static unsigned long alloc_count; static unsigned long branch_count; static unsigned long object_count; @@ -216,9 +216,9 @@ static unsigned int avail_tree_table_sz = 100; static struct avail_tree_content **avail_tree_table; /* Branch data */ -static unsigned int max_active_branches = 5; -static unsigned int cur_active_branches; -static unsigned int branch_table_sz = 1039; +static unsigned long max_active_branches = 5; +static unsigned long cur_active_branches; +static unsigned long branch_table_sz = 1039; static struct branch **branch_table; static struct branch *active_branches; @@ -1236,10 +1236,14 @@ static void cmd_new_branch() die("An lf did not terminate the branch command as expected."); } +static const char fast_import_usage[] = +"git-fast-import [--objects=n] [--depth=n] [--active-branches=n] temp.pack"; + int main(int argc, const char **argv) { - const char *base_name = argv[1]; - int est_obj_cnt = atoi(argv[2]); + const char *base_name; + int i; + unsigned long est_obj_cnt = 1000; char *pack_name; char *idx_name; struct stat sb; @@ -1247,6 +1251,24 @@ int main(int argc, const char **argv) setup_ident(); git_config(git_default_config); + for (i = 1; i < argc; i++) { + const char *a = argv[i]; + + if (*a != '-' || !strcmp(a, "--")) + break; + else if (!strncmp(a, "--objects=", 10)) + est_obj_cnt = strtoul(a + 10, NULL, 0); + 
else if (!strncmp(a, "--depth=", 8)) + max_depth = strtoul(a + 8, NULL, 0); + else if (!strncmp(a, "--active-branches=", 18)) + max_active_branches = strtoul(a + 18, NULL, 0); + else + die("unknown option %s", a); + } + if ((i+1) != argc) + usage(fast_import_usage); + base_name = argv[i]; + pack_name = xmalloc(strlen(base_name) + 6); sprintf(pack_name, "%s.pack", base_name); idx_name = xmalloc(strlen(base_name) + 5); From d83971688ba42e4cd37908f4d776801a997ca421 Mon Sep 17 00:00:00 2001 From: "Shawn O. Pearce" Date: Wed, 23 Aug 2006 04:17:45 -0400 Subject: [PATCH 14/81] Added mark store/find to fast-import. Marks are now saved when the mark directive gets used by the frontend and may be used in place of a SHA1 expression to locate a previous SHA1 which fast-import may have generated. This is particularly useful with commits where the frontend does not (easily) have the ability to compute the SHA1 for an arbitrary commit but needs it to generate a branch or tag from that commit. Signed-off-by: Shawn O. 
Pearce --- fast-import.c | 104 +++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 87 insertions(+), 17 deletions(-) diff --git a/fast-import.c b/fast-import.c index 8598493651..6e2f106a1a 100644 --- a/fast-import.c +++ b/fast-import.c @@ -121,6 +121,15 @@ struct object_entry_pool struct object_entry entries[FLEX_ARRAY]; /* more */ }; +struct mark_set +{ + int shift; + union { + struct object_entry *marked[1024]; + struct mark_set *sets[1024]; + } data; +}; + struct last_object { void *data; @@ -183,6 +192,7 @@ static unsigned long alloc_count; static unsigned long branch_count; static unsigned long object_count; static unsigned long duplicate_count; +static unsigned long marks_set_count; static unsigned long object_count_by_type[9]; static unsigned long duplicate_count_by_type[9]; @@ -205,6 +215,7 @@ static unsigned char pack_sha1[20]; static unsigned int object_entry_alloc = 1000; static struct object_entry_pool *blocks; static struct object_entry *object_table[1 << 16]; +static struct mark_set *marks; /* Our last blob */ static struct last_object last_blob; @@ -224,7 +235,7 @@ static struct branch *active_branches; /* Input stream parsing */ static struct strbuf command_buf; -static unsigned long command_mark; +static unsigned long next_mark; static void alloc_objects(int cnt) @@ -335,6 +346,48 @@ static char* pool_strdup(const char *s) return r; } +static void insert_mark(unsigned long idnum, struct object_entry *oe) +{ + struct mark_set *s = marks; + while ((idnum >> s->shift) >= 1024) { + s = pool_calloc(1, sizeof(struct mark_set)); + s->shift = marks->shift + 10; + s->data.sets[0] = marks; + marks = s; + } + while (s->shift) { + unsigned long i = idnum >> s->shift; + idnum -= i << s->shift; + if (!s->data.sets[i]) { + s->data.sets[i] = pool_calloc(1, sizeof(struct mark_set)); + s->data.sets[i]->shift = s->shift - 10; + } + s = s->data.sets[i]; + } + if (!s->data.marked[idnum]) + marks_set_count++; + s->data.marked[idnum] = oe; +} + +static 
struct object_entry* find_mark(unsigned long idnum) +{ + unsigned long orig_idnum = idnum; + struct mark_set *s = marks; + struct object_entry *oe = NULL; + if ((idnum >> s->shift) < 1024) { + while (s && s->shift) { + unsigned long i = idnum >> s->shift; + idnum -= i << s->shift; + s = s->data.sets[i]; + } + if (s) + oe = s->data.marked[idnum]; + } + if (!oe) + die("mark :%lu not declared", orig_idnum); + return oe; +} + static struct atom_str* to_atom(const char *s, size_t len) { unsigned int hc = hc_str(s, len) % atom_table_sz; @@ -523,7 +576,8 @@ static int store_object( void *dat, size_t datlen, struct last_object *last, - unsigned char *sha1out) + unsigned char *sha1out, + unsigned long mark) { void *out, *delta; struct object_entry *e; @@ -542,6 +596,8 @@ static int store_object( memcpy(sha1out, sha1, sizeof(sha1)); e = insert_object(sha1); + if (mark) + insert_mark(mark, e); if (e->offset) { duplicate_count++; duplicate_count_by_type[type]++; @@ -695,7 +751,7 @@ static void store_tree(struct tree_entry *root) memcpy(c, e->sha1, 20); c += 20; } - store_object(OBJ_TREE, buf, c - buf, NULL, root->sha1); + store_object(OBJ_TREE, buf, c - buf, NULL, root->sha1, 0); free(buf); } @@ -921,11 +977,11 @@ static void read_next_command() static void cmd_mark() { if (!strncmp("mark :", command_buf.buf, 6)) { - command_mark = strtoul(command_buf.buf + 6, NULL, 10); + next_mark = strtoul(command_buf.buf + 6, NULL, 10); read_next_command(); } else - command_mark = 0; + next_mark = 0; } static void* cmd_data (size_t *size) @@ -956,16 +1012,15 @@ static void* cmd_data (size_t *size) static void cmd_new_blob() { - size_t datlen; - void *dat; - unsigned char sha1[20]; + size_t l; + void *d; read_next_command(); cmd_mark(); - dat = cmd_data(&datlen); + d = cmd_data(&l); - if (store_object(OBJ_BLOB, dat, datlen, &last_blob, sha1)) - free(dat); + if (store_object(OBJ_BLOB, d, l, &last_blob, NULL, next_mark)) + free(d); } static void unload_one_branch() @@ -1031,9 +1086,16 @@ 
static void file_change_m(struct branch *b) die("Corrupt mode: %s", command_buf.buf); } - if (get_sha1_hex(p, sha1)) - die("Invalid SHA1: %s", command_buf.buf); - p += 40; + if (*p == ':') { + char *x; + oe = find_mark(strtoul(p + 1, &x, 10)); + p = x; + } else { + if (get_sha1_hex(p, sha1)) + die("Invalid SHA1: %s", command_buf.buf); + oe = find_object(sha1); + p += 40; + } if (*p++ != ' ') die("Missing space after SHA1: %s", command_buf.buf); @@ -1044,7 +1106,6 @@ static void file_change_m(struct branch *b) p = p_uq; } - oe = find_object(sha1); if (oe) { if (oe->type != OBJ_BLOB) die("Not a blob (actually a %s): %s", @@ -1161,7 +1222,7 @@ static void cmd_new_commit() free(committer); free(msg); - store_object(OBJ_COMMIT, body, sp - body, NULL, b->sha1); + store_object(OBJ_COMMIT, body, sp - body, NULL, b->sha1, next_mark); free(body); b->last_commit = object_count_by_type[OBJ_COMMIT]; } @@ -1205,6 +1266,13 @@ static void cmd_new_branch() else if (s) { memcpy(b->sha1, s->sha1, 20); memcpy(b->branch_tree.sha1, s->branch_tree.sha1, 20); + } else if (*from == ':') { + unsigned long idnum = strtoul(from + 1, NULL, 10); + struct object_entry *oe = find_mark(idnum); + if (oe->type != OBJ_COMMIT) + die("Mark :%lu not a commit", idnum); + memcpy(b->sha1, oe->sha1, 20); + memcpy(b->branch_tree.sha1, null_sha1, 20); } else if (!get_sha1(from, b->sha1)) { if (!memcmp(b->sha1, null_sha1, 20)) memcpy(b->branch_tree.sha1, null_sha1, 20); @@ -1285,6 +1353,7 @@ int main(int argc, const char **argv) atom_table = xcalloc(atom_table_sz, sizeof(struct atom_str*)); branch_table = xcalloc(branch_table_sz, sizeof(struct branch*)); avail_tree_table = xcalloc(avail_tree_table_sz, sizeof(struct avail_tree_content*)); + marks = pool_calloc(1, sizeof(struct mark_set)); for (;;) { read_next_command(); @@ -1314,7 +1383,8 @@ int main(int argc, const char **argv) fprintf(stderr, " commits: %10lu (%10lu duplicates)\n", object_count_by_type[OBJ_COMMIT], duplicate_count_by_type[OBJ_COMMIT]); 
fprintf(stderr, " tags : %10lu (%10lu duplicates)\n", object_count_by_type[OBJ_TAG], duplicate_count_by_type[OBJ_TAG]); fprintf(stderr, "Total branches: %10lu\n", branch_count); - fprintf(stderr, "Total atoms: %10u\n", atom_cnt); + fprintf(stderr, " atoms: %10u\n", atom_cnt); + fprintf(stderr, " marks: %10u (%10lu unique )\n", (1 << marks->shift) * 1024, marks_set_count); fprintf(stderr, "Memory total: %10lu KiB\n", (total_allocd + alloc_count*sizeof(struct object_entry))/1024); fprintf(stderr, " pools: %10lu KiB\n", total_allocd/1024); fprintf(stderr, " objects: %10lu KiB\n", (alloc_count*sizeof(struct object_entry))/1024); From d6c7eb2c160fc40c48fd25fdae15c193eec13bb7 Mon Sep 17 00:00:00 2001 From: "Shawn O. Pearce" Date: Wed, 23 Aug 2006 04:31:12 -0400 Subject: [PATCH 15/81] Added branch load counter to fast-import. If the branch load count exceeds the number of branches created then the frontend is causing fast-import to page branches into and out of memory due to the way it's ordering its commits. Performance can likely be increased if the frontend were to alter its commit sequence such that it stays on one branch before switching to another branch, then never returns to the prior branch. Signed-off-by: Shawn O. 
Pearce --- fast-import.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/fast-import.c b/fast-import.c index 6e2f106a1a..50171d69ca 100644 --- a/fast-import.c +++ b/fast-import.c @@ -190,6 +190,7 @@ struct branch static unsigned long max_depth = 10; static unsigned long alloc_count; static unsigned long branch_count; +static unsigned long branch_load_count; static unsigned long object_count; static unsigned long duplicate_count; static unsigned long marks_set_count; @@ -1059,6 +1060,7 @@ static void load_branch(struct branch *b) b->active_next_branch = active_branches; active_branches = b; cur_active_branches++; + branch_load_count++; } static void file_change_m(struct branch *b) @@ -1382,9 +1384,9 @@ int main(int argc, const char **argv) fprintf(stderr, " trees : %10lu (%10lu duplicates)\n", object_count_by_type[OBJ_TREE], duplicate_count_by_type[OBJ_TREE]); fprintf(stderr, " commits: %10lu (%10lu duplicates)\n", object_count_by_type[OBJ_COMMIT], duplicate_count_by_type[OBJ_COMMIT]); fprintf(stderr, " tags : %10lu (%10lu duplicates)\n", object_count_by_type[OBJ_TAG], duplicate_count_by_type[OBJ_TAG]); - fprintf(stderr, "Total branches: %10lu\n", branch_count); - fprintf(stderr, " atoms: %10u\n", atom_cnt); + fprintf(stderr, "Total branches: %10lu (%10lu loads )\n", branch_count, branch_load_count); fprintf(stderr, " marks: %10u (%10lu unique )\n", (1 << marks->shift) * 1024, marks_set_count); + fprintf(stderr, " atoms: %10u\n", atom_cnt); fprintf(stderr, "Memory total: %10lu KiB\n", (total_allocd + alloc_count*sizeof(struct object_entry))/1024); fprintf(stderr, " pools: %10lu KiB\n", total_allocd/1024); fprintf(stderr, " objects: %10lu KiB\n", (alloc_count*sizeof(struct object_entry))/1024); From 72303d44e9d8f3fc9bef039b472a2bd259509420 Mon Sep 17 00:00:00 2001 From: "Shawn O. Pearce" Date: Thu, 24 Aug 2006 03:12:13 -0400 Subject: [PATCH 16/81] Implemented 'tag' command in fast-import. 
Tags received from the frontend are generated in memory in a simple linked list in the order that the tag commands were sent by the frontend. If multiple different tag objects for the same tag name get generated the last one sent by the frontend will be the one that gets written out at termination. Multiple tag objects for the same name will cause all older tags of the same name to be lost. Signed-off-by: Shawn O. Pearce --- fast-import.c | 125 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 125 insertions(+) diff --git a/fast-import.c b/fast-import.c index 50171d69ca..e692f6b430 100644 --- a/fast-import.c +++ b/fast-import.c @@ -185,6 +185,13 @@ struct branch unsigned char sha1[20]; }; +struct tag +{ + struct tag *next_tag; + const char *name; + unsigned char sha1[20]; +}; + /* Stats and misc. counters */ static unsigned long max_depth = 10; @@ -234,6 +241,10 @@ static unsigned long branch_table_sz = 1039; static struct branch **branch_table; static struct branch *active_branches; +/* Tag data */ +static struct tag *first_tag; +static struct tag *last_tag; + /* Input stream parsing */ static struct strbuf command_buf; static unsigned long next_mark; @@ -970,6 +981,21 @@ static void dump_branches() } } +static void dump_tags() +{ + static const char *msg = "fast-import"; + struct tag *t; + struct ref_lock *lock; + char path[PATH_MAX]; + + for (t = first_tag; t; t = t->next_tag) { + sprintf(path, "refs/tags/%s", t->name); + lock = lock_any_ref_for_update(path, NULL, 0); + if (!lock || write_ref_sha1(lock, t->sha1, msg) < 0) + die("Can't write %s", path); + } +} + static void read_next_command() { read_line(&command_buf, stdin, '\n'); @@ -1306,6 +1332,102 @@ static void cmd_new_branch() die("An lf did not terminate the branch command as expected."); } +static void cmd_new_tag() +{ + char *str_uq; + const char *endp; + char *sp; + const char *from; + char *tagger; + struct branch *s; + void *msg; + size_t msglen; + char *body; + struct tag *t; + 
unsigned char sha1[20]; + + /* Obtain the new tag name from the rest of our command */ + sp = strchr(command_buf.buf, ' ') + 1; + str_uq = unquote_c_style(sp, &endp); + if (str_uq) { + if (*endp) + die("Garbage after tag name in: %s", command_buf.buf); + sp = str_uq; + } + t = pool_alloc(sizeof(struct tag)); + t->next_tag = NULL; + t->name = pool_strdup(sp); + if (last_tag) + last_tag->next_tag = t; + else + first_tag = t; + last_tag = t; + if (str_uq) + free(str_uq); + read_next_command(); + + /* from ... */ + if (strncmp("from ", command_buf.buf, 5)) + die("Expected from command, got %s", command_buf.buf); + + from = strchr(command_buf.buf, ' ') + 1; + str_uq = unquote_c_style(from, &endp); + if (str_uq) { + if (*endp) + die("Garbage after string in: %s", command_buf.buf); + from = str_uq; + } + + s = lookup_branch(from); + if (s) { + memcpy(sha1, s->sha1, 20); + } else if (*from == ':') { + unsigned long idnum = strtoul(from + 1, NULL, 10); + struct object_entry *oe = find_mark(idnum); + if (oe->type != OBJ_COMMIT) + die("Mark :%lu not a commit", idnum); + memcpy(sha1, oe->sha1, 20); + } else if (!get_sha1(from, sha1)) { + unsigned long size; + char *buf; + + buf = read_object_with_reference(sha1, + type_names[OBJ_COMMIT], &size, sha1); + if (!buf || size < 46) + die("Not a valid commit: %s", from); + free(buf); + } else + die("Invalid ref name or SHA1 expression: %s", from); + + if (str_uq) + free(str_uq); + read_next_command(); + + /* tagger ... 
*/ + if (strncmp("tagger ", command_buf.buf, 7)) + die("Expected tagger command, got %s", command_buf.buf); + tagger = strdup(command_buf.buf); + + /* tag payload/message */ + read_next_command(); + msg = cmd_data(&msglen); + + /* build the tag object */ + body = xmalloc(67 + strlen(t->name) + strlen(tagger) + msglen); + sp = body; + sp += sprintf(sp, "object %s\n", sha1_to_hex(sha1)); + sp += sprintf(sp, "type %s\n", type_names[OBJ_COMMIT]); + sp += sprintf(sp, "tag %s\n", t->name); + sp += sprintf(sp, "%s\n\n", tagger); + memcpy(sp, msg, msglen); + sp += msglen; + free(tagger); + free(msg); + + store_object(OBJ_TAG, body, sp - body, NULL, t->sha1, 0); + free(body); +} + static const char fast_import_usage[] = "git-fast-import [--objects=n] [--depth=n] [--active-branches=n] temp.pack"; @@ -1367,6 +1489,8 @@ int main(int argc, const char **argv) cmd_new_branch(); else if (!strncmp("commit ", command_buf.buf, 7)) cmd_new_commit(); + else if (!strncmp("tag ", command_buf.buf, 4)) + cmd_new_tag(); else die("Unsupported command: %s", command_buf.buf); } @@ -1375,6 +1499,7 @@ int main(int argc, const char **argv) close(pack_fd); write_index(idx_name); dump_branches(); + dump_tags(); fprintf(stderr, "%s statistics:\n", argv[0]); fprintf(stderr, "---------------------------------------------------\n"); From 41e5257fcf4db31dfa2576aac1f50b140f2bb058 Mon Sep 17 00:00:00 2001 From: "Shawn O. Pearce" Date: Thu, 24 Aug 2006 04:37:35 -0400 Subject: [PATCH 17/81] Implemented tree reloading in fast-import. Tree reloading allows fast-import to swap out the least-recently used branch by simply deallocating the data structures from memory that were associated with that branch. Later if the branch becomes active again it can lazily recreate those structures on demand by reloading the necessary trees from the pack file it originally wrote them to. 
The reloading process is implemented by mmap'ing the pack into memory and using a much tighter variant of the pack reading code contained in sha1_file.c. This was a blatant copy from sha1_file.c but the unpacking functions were significantly simplified and are actually now in a form that should make it easier to map only the necessary regions of a pack rather than the entire file. Signed-off-by: Shawn O. Pearce --- fast-import.c | 162 ++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 149 insertions(+), 13 deletions(-) diff --git a/fast-import.c b/fast-import.c index e692f6b430..1c74b90c84 100644 --- a/fast-import.c +++ b/fast-import.c @@ -198,6 +198,7 @@ static unsigned long max_depth = 10; static unsigned long alloc_count; static unsigned long branch_count; static unsigned long branch_load_count; +static unsigned long remap_count; static unsigned long object_count; static unsigned long duplicate_count; static unsigned long marks_set_count; @@ -216,8 +217,10 @@ static struct atom_str **atom_table; /* The .pack file being generated */ static int pack_fd; -static unsigned long pack_offset; +static unsigned long pack_size; static unsigned char pack_sha1[20]; +static void* pack_base; +static size_t pack_mlen; /* Table of objects we've written. 
*/ static unsigned int object_entry_alloc = 1000; @@ -616,7 +619,7 @@ static int store_object( return 1; } e->type = type; - e->offset = pack_offset; + e->offset = pack_size; object_count++; object_count_by_type[type]++; @@ -637,7 +640,7 @@ static int store_object( hdrlen = encode_header(OBJ_DELTA, deltalen, hdr); ywrite(pack_fd, hdr, hdrlen); ywrite(pack_fd, last->sha1, sizeof(sha1)); - pack_offset += hdrlen + sizeof(sha1); + pack_size += hdrlen + sizeof(sha1); } else { if (last) last->depth = 0; @@ -645,7 +648,7 @@ static int store_object( s.avail_in = datlen; hdrlen = encode_header(type, datlen, hdr); ywrite(pack_fd, hdr, hdrlen); - pack_offset += hdrlen; + pack_size += hdrlen; } s.avail_out = deflateBound(&s, s.avail_in); @@ -655,7 +658,7 @@ static int store_object( deflateEnd(&s); ywrite(pack_fd, out, s.total_out); - pack_offset += s.total_out; + pack_size += s.total_out; free(out); if (delta) @@ -670,6 +673,127 @@ static int store_object( return 0; } +static void* map_pack(unsigned long offset) +{ + if (offset >= pack_size) + die("object offset outside of pack file"); + if (offset >= pack_mlen) { + if (pack_base) + munmap(pack_base, pack_mlen); + /* round out how much we map to 16 MB units */ + pack_mlen = pack_size; + if (pack_mlen & ((1 << 24) - 1)) + pack_mlen = ((pack_mlen >> 24) + 1) << 24; + pack_base = mmap(NULL,pack_mlen,PROT_READ,MAP_SHARED,pack_fd,0); + if (pack_base == MAP_FAILED) + die("Failed to map generated pack: %s", strerror(errno)); + remap_count++; + } + return (char*)pack_base + offset; +} + +static unsigned long unpack_object_header(unsigned long offset, + enum object_type *type, + unsigned long *sizep) +{ + unsigned shift; + unsigned char c; + unsigned long size; + + c = *(unsigned char*)map_pack(offset++); + *type = (c >> 4) & 7; + size = c & 15; + shift = 4; + while (c & 0x80) { + c = *(unsigned char*)map_pack(offset++); + size += (c & 0x7f) << shift; + shift += 7; + } + *sizep = size; + return offset; +} + +static void 
*unpack_non_delta_entry(unsigned long o, unsigned long sz) +{ + z_stream stream; + unsigned char *result; + + result = xmalloc(sz + 1); + result[sz] = 0; + + memset(&stream, 0, sizeof(stream)); + stream.next_in = map_pack(o); + stream.avail_in = pack_mlen - o; + stream.next_out = result; + stream.avail_out = sz; + + inflateInit(&stream); + for (;;) { + int st = inflate(&stream, Z_FINISH); + if (st == Z_STREAM_END) + break; + if (st == Z_OK) { + o = stream.next_in - (unsigned char*)pack_base; + stream.next_in = map_pack(o); + stream.avail_in = pack_mlen - o; + continue; + } + die("Error from zlib during inflate."); + } + inflateEnd(&stream); + if (stream.total_out != sz) + die("Error after inflate: sizes mismatch"); + return result; +} + +static void *unpack_entry(unsigned long offset, unsigned long *sizep); + +static void *unpack_delta_entry(unsigned long offset, + unsigned long delta_size, + unsigned long *sizep) +{ + struct object_entry *base_oe; + unsigned char *base_sha1; + void *delta_data, *base, *result; + unsigned long base_size, result_size; + + base_sha1 = (unsigned char*)map_pack(offset + 20) - 20; + base_oe = find_object(base_sha1); + if (!base_oe) + die("I'm broken; I can't find a base I know must be here."); + base = unpack_entry(base_oe->offset, &base_size); + delta_data = unpack_non_delta_entry(offset + 20, delta_size); + result = patch_delta(base, base_size, + delta_data, delta_size, + &result_size); + if (!result) + die("failed to apply delta"); + free(delta_data); + free(base); + *sizep = result_size; + return result; +} + +static void *unpack_entry(unsigned long offset, unsigned long *sizep) +{ + unsigned long size; + enum object_type kind; + + offset = unpack_object_header(offset, &kind, &size); + switch (kind) { + case OBJ_DELTA: + return unpack_delta_entry(offset, size, sizep); + case OBJ_COMMIT: + case OBJ_TREE: + case OBJ_BLOB: + case OBJ_TAG: + *sizep = size; + return unpack_non_delta_entry(offset, size); + default: + die("I created an 
object I can't read!"); + } +} + static const char *get_mode(const char *str, unsigned int *modep) { unsigned char c; @@ -691,7 +815,6 @@ static void load_tree(struct tree_entry *root) unsigned long size; char *buf; const char *c; - char type[20]; root->tree = t = new_tree_content(8); if (!memcmp(root->sha1, null_sha1, 20)) @@ -699,11 +822,14 @@ static void load_tree(struct tree_entry *root) myoe = find_object(root->sha1); if (myoe) { - die("FIXME"); + if (myoe->type != OBJ_TREE) + die("Not a tree: %s", sha1_to_hex(root->sha1)); + buf = unpack_entry(myoe->offset, &size); } else { + char type[20]; buf = read_sha1_file(root->sha1, type, &size); - if (!buf || strcmp(type, tree_type)) - die("Can't load existing tree %s", sha1_to_hex(root->sha1)); + if (!buf || !strcmp(type, tree_type)) + die("Can't load tree %s", sha1_to_hex(root->sha1)); } c = buf; @@ -880,7 +1006,7 @@ static void init_pack_header() hdr.hdr_entries = 0; ywrite(pack_fd, &hdr, sizeof(hdr)); - pack_offset = sizeof(hdr); + pack_size = sizeof(hdr); } static void fixup_header_footer() @@ -1052,7 +1178,8 @@ static void cmd_new_blob() static void unload_one_branch() { - while (cur_active_branches >= max_active_branches) { + while (cur_active_branches + && cur_active_branches >= max_active_branches) { unsigned long min_commit = ULONG_MAX; struct branch *e, *l = NULL, *p = NULL; @@ -1210,7 +1337,7 @@ static void cmd_new_commit() msg = cmd_data(&msglen); /* ensure the branch is active/loaded */ - if (!b->branch_tree.tree) { + if (!b->branch_tree.tree || !max_active_branches) { unload_one_branch(); load_branch(b); } @@ -1297,10 +1424,18 @@ static void cmd_new_branch() } else if (*from == ':') { unsigned long idnum = strtoul(from + 1, NULL, 10); struct object_entry *oe = find_mark(idnum); + unsigned long size; + char *buf; if (oe->type != OBJ_COMMIT) die("Mark :%lu not a commit", idnum); memcpy(b->sha1, oe->sha1, 20); - memcpy(b->branch_tree.sha1, null_sha1, 20); + buf = unpack_entry(oe->offset, &size); + if (!buf 
|| size < 46) + die("Not a valid commit: %s", from); + if (memcmp("tree ", buf, 5) + || get_sha1_hex(buf + 5, b->branch_tree.sha1)) + die("The commit %s is corrupt", sha1_to_hex(b->sha1)); + free(buf); } else if (!get_sha1(from, b->sha1)) { if (!memcmp(b->sha1, null_sha1, 20)) memcpy(b->branch_tree.sha1, null_sha1, 20); @@ -1515,6 +1650,7 @@ int main(int argc, const char **argv) fprintf(stderr, "Memory total: %10lu KiB\n", (total_allocd + alloc_count*sizeof(struct object_entry))/1024); fprintf(stderr, " pools: %10lu KiB\n", total_allocd/1024); fprintf(stderr, " objects: %10lu KiB\n", (alloc_count*sizeof(struct object_entry))/1024); + fprintf(stderr, "Pack remaps: %10lu\n", remap_count); fprintf(stderr, "---------------------------------------------------\n"); stat(pack_name, &sb); From 8d8928b0511313ba1740d39c3920f8f12f36a10a Mon Sep 17 00:00:00 2001 From: "Shawn O. Pearce" Date: Thu, 24 Aug 2006 04:46:29 -0400 Subject: [PATCH 18/81] Round out memory pool allocations in fast-import to pointer sizes. Some architectures (e.g. SPARC) would require that we access pointers only on pointer-sized alignments. So ensure the pool allocator rounds out non-pointer sized allocations to the next pointer so we don't generate bad memory addresses. This could have occurred if we had previously allocated an atom whose string was not a whole multiple of the pointer size, for example. Signed-off-by: Shawn O. Pearce --- fast-import.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fast-import.c b/fast-import.c index 1c74b90c84..e42bdbd3a3 100644 --- a/fast-import.c +++ b/fast-import.c @@ -342,6 +342,9 @@ static void* pool_alloc(size_t len) } r = p->next_free; + /* round out to a pointer alignment */ + if (len & (sizeof(void*) - 1)) + len += sizeof(void*) - (len & (sizeof(void*) - 1)); p->next_free += len; return r; } From 00e2b8842c58e451fcf8038287c8420423bab50a Mon Sep 17 00:00:00 2001 From: "Shawn O. 
Pearce" Date: Thu, 24 Aug 2006 18:45:26 -0400 Subject: [PATCH 19/81] Remove branch creation command from fast-import. Jon Smirl was finding it difficult to alter cvs2svn to generate branch commands prior to the first commit of the same branch. This change moves the 'from' command to be an optional parameter of the 'commit' command, thereby allowing a new branch to be defined at the moment it gets used to create the first commit on that branch. This change makes it impossible to create a branch with no commits on it as at least one commit is needed to register the branch. Signed-off-by: Shawn O. Pearce --- fast-import.c | 169 +++++++++++++++++++++----------------------------- 1 file changed, 71 insertions(+), 98 deletions(-) diff --git a/fast-import.c b/fast-import.c index e42bdbd3a3..3e527edf70 100644 --- a/fast-import.c +++ b/fast-import.c @@ -4,7 +4,6 @@ Format of STDIN stream: stream ::= cmd*; cmd ::= new_blob - | new_branch | new_commit | new_tag ; @@ -14,15 +13,12 @@ Format of STDIN stream: file_content; file_content ::= data; - new_branch ::= 'branch' sp ref_str lf - ('from' sp (ref_str | hexsha1 | sha1exp_str | idnum) lf)? - lf; - new_commit ::= 'commit' sp ref_str lf - mark? - ('author' sp name '<' email '>' ts tz lf)? - 'committer' sp name '<' email '>' ts tz lf - commit_msg + ('from' sp (ref_str | hexsha1 | sha1exp_str | idnum) lf)? + mark? + ('author' sp name '<' email '>' ts tz lf)? 
+ 'committer' sp name '<' email '>' ts tz lf + commit_msg file_change* lf; commit_msg ::= data; @@ -831,7 +827,7 @@ static void load_tree(struct tree_entry *root) } else { char type[20]; buf = read_sha1_file(root->sha1, type, &size); - if (!buf || !strcmp(type, tree_type)) + if (!buf || strcmp(type, tree_type)) die("Can't load tree %s", sha1_to_hex(root->sha1)); } @@ -1299,6 +1295,69 @@ static void file_change_d(struct branch *b) free(p_uq); } +static void cmd_from(struct branch *b) +{ + const char *from, *endp; + char *str_uq; + struct branch *s; + + if (strncmp("from ", command_buf.buf, 5)) + return; + + if (b->last_commit) + die("Can't reinitailize branch %s", b->name); + + from = strchr(command_buf.buf, ' ') + 1; + str_uq = unquote_c_style(from, &endp); + if (str_uq) { + if (*endp) + die("Garbage after string in: %s", command_buf.buf); + from = str_uq; + } + + s = lookup_branch(from); + if (b == s) + die("Can't create a branch from itself: %s", b->name); + else if (s) { + memcpy(b->sha1, s->sha1, 20); + memcpy(b->branch_tree.sha1, s->branch_tree.sha1, 20); + } else if (*from == ':') { + unsigned long idnum = strtoul(from + 1, NULL, 10); + struct object_entry *oe = find_mark(idnum); + unsigned long size; + char *buf; + if (oe->type != OBJ_COMMIT) + die("Mark :%lu not a commit", idnum); + memcpy(b->sha1, oe->sha1, 20); + buf = unpack_entry(oe->offset, &size); + if (!buf || size < 46) + die("Not a valid commit: %s", from); + if (memcmp("tree ", buf, 5) + || get_sha1_hex(buf + 5, b->branch_tree.sha1)) + die("The commit %s is corrupt", sha1_to_hex(b->sha1)); + free(buf); + } else if (!get_sha1(from, b->sha1)) { + if (!memcmp(b->sha1, null_sha1, 20)) + memcpy(b->branch_tree.sha1, null_sha1, 20); + else { + unsigned long size; + char *buf; + + buf = read_object_with_reference(b->sha1, + type_names[OBJ_COMMIT], &size, b->sha1); + if (!buf || size < 46) + die("Not a valid commit: %s", from); + if (memcmp("tree ", buf, 5) + || get_sha1_hex(buf + 5, b->branch_tree.sha1)) 
+ die("The commit %s is corrupt", sha1_to_hex(b->sha1)); + free(buf); + } + } else + die("Invalid ref name or SHA1 expression: %s", from); + + read_next_command(); +} + static void cmd_new_commit() { struct branch *b; @@ -1321,11 +1380,12 @@ static void cmd_new_commit() } b = lookup_branch(sp); if (!b) - die("Branch not declared: %s", sp); + b = new_branch(sp); if (str_uq) free(str_uq); read_next_command(); + cmd_from(b); cmd_mark(); if (!strncmp("author ", command_buf.buf, 7)) { author = strdup(command_buf.buf); @@ -1385,91 +1445,6 @@ static void cmd_new_commit() b->last_commit = object_count_by_type[OBJ_COMMIT]; } -static void cmd_new_branch() -{ - struct branch *b; - char *str_uq; - const char *endp; - char *sp; - - /* Obtain the new branch name from the rest of our command */ - sp = strchr(command_buf.buf, ' ') + 1; - str_uq = unquote_c_style(sp, &endp); - if (str_uq) { - if (*endp) - die("Garbage after ref in: %s", command_buf.buf); - sp = str_uq; - } - b = new_branch(sp); - if (str_uq) - free(str_uq); - read_next_command(); - - /* from ... 
*/ - if (!strncmp("from ", command_buf.buf, 5)) { - const char *from; - struct branch *s; - - from = strchr(command_buf.buf, ' ') + 1; - str_uq = unquote_c_style(from, &endp); - if (str_uq) { - if (*endp) - die("Garbage after string in: %s", command_buf.buf); - from = str_uq; - } - - s = lookup_branch(from); - if (b == s) - die("Can't create a branch from itself: %s", b->name); - else if (s) { - memcpy(b->sha1, s->sha1, 20); - memcpy(b->branch_tree.sha1, s->branch_tree.sha1, 20); - } else if (*from == ':') { - unsigned long idnum = strtoul(from + 1, NULL, 10); - struct object_entry *oe = find_mark(idnum); - unsigned long size; - char *buf; - if (oe->type != OBJ_COMMIT) - die("Mark :%lu not a commit", idnum); - memcpy(b->sha1, oe->sha1, 20); - buf = unpack_entry(oe->offset, &size); - if (!buf || size < 46) - die("Not a valid commit: %s", from); - if (memcmp("tree ", buf, 5) - || get_sha1_hex(buf + 5, b->branch_tree.sha1)) - die("The commit %s is corrupt", sha1_to_hex(b->sha1)); - free(buf); - } else if (!get_sha1(from, b->sha1)) { - if (!memcmp(b->sha1, null_sha1, 20)) - memcpy(b->branch_tree.sha1, null_sha1, 20); - else { - unsigned long size; - char *buf; - - buf = read_object_with_reference(b->sha1, - type_names[OBJ_COMMIT], &size, b->sha1); - if (!buf || size < 46) - die("Not a valid commit: %s", from); - if (memcmp("tree ", buf, 5) - || get_sha1_hex(buf + 5, b->branch_tree.sha1)) - die("The commit %s is corrupt", sha1_to_hex(b->sha1)); - free(buf); - } - } else - die("Invalid ref name or SHA1 expression: %s", from); - - if (str_uq) - free(str_uq); - read_next_command(); - } else { - memcpy(b->sha1, null_sha1, 20); - memcpy(b->branch_tree.sha1, null_sha1, 20); - } - - if (command_buf.eof || command_buf.len > 1) - die("An lf did not terminate the branch command as expected."); -} - static void cmd_new_tag() { char *str_uq; @@ -1623,8 +1598,6 @@ int main(int argc, const char **argv) break; else if (!strcmp("blob", command_buf.buf)) cmd_new_blob(); - else if 
(!strncmp("branch ", command_buf.buf, 7)) - cmd_new_branch(); else if (!strncmp("commit ", command_buf.buf, 7)) cmd_new_commit(); else if (!strncmp("tag ", command_buf.buf, 4)) From 02f3389d9647378ed864ff1cdfb6f0238b64ee91 Mon Sep 17 00:00:00 2001 From: "Shawn O. Pearce" Date: Thu, 24 Aug 2006 22:38:13 -0400 Subject: [PATCH 20/81] Moved from command to after data to help cvs2svn. cvs2svn has three phases: begin_commit, middle_commit, end_commit. The ancestor is computed in the middle_commit phase. So it's easier to generate a stream if the from command appears after the commit message itself but before the file change commands. Signed-off-by: Shawn O. Pearce --- fast-import.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/fast-import.c b/fast-import.c index 3e527edf70..1842d0738b 100644 --- a/fast-import.c +++ b/fast-import.c @@ -14,11 +14,11 @@ Format of STDIN stream: file_content ::= data; new_commit ::= 'commit' sp ref_str lf - ('from' sp (ref_str | hexsha1 | sha1exp_str | idnum) lf)? mark? ('author' sp name '<' email '>' ts tz lf)? 'committer' sp name '<' email '>' ts tz lf commit_msg + ('from' sp (ref_str | hexsha1 | sha1exp_str | idnum) lf)? 
file_change* lf; commit_msg ::= data; @@ -1385,7 +1385,6 @@ static void cmd_new_commit() free(str_uq); read_next_command(); - cmd_from(b); cmd_mark(); if (!strncmp("author ", command_buf.buf, 7)) { author = strdup(command_buf.buf); @@ -1398,6 +1397,8 @@ static void cmd_new_commit() if (!committer) die("Expected committer but didn't get one"); msg = cmd_data(&msglen); + read_next_command(); + cmd_from(b); /* ensure the branch is active/loaded */ if (!b->branch_tree.tree || !max_active_branches) { @@ -1407,7 +1408,6 @@ static void cmd_new_commit() /* file_change* */ for (;;) { - read_next_command(); if (1 == command_buf.len) break; else if (!strncmp("M ", command_buf.buf, 2)) @@ -1416,6 +1416,7 @@ static void cmd_new_commit() file_change_d(b); else die("Unsupported file_change: %s", command_buf.buf); + read_next_command(); } /* build the tree and the commit */ From 8435a9cb2662ca4326e96ea78d58b9376fb21f7e Mon Sep 17 00:00:00 2001 From: "Shawn O. Pearce" Date: Fri, 25 Aug 2006 14:53:32 -0400 Subject: [PATCH 21/81] Account for tree entry memory costs in fast-import. Signed-off-by: Shawn O. Pearce --- fast-import.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fast-import.c b/fast-import.c index 1842d0738b..311db4e6d5 100644 --- a/fast-import.c +++ b/fast-import.c @@ -516,6 +516,7 @@ static struct tree_entry* new_tree_entry() if (!avail_tree_entry) { unsigned int n = tree_entry_alloc; + total_allocd += n * sizeof(struct tree_entry); avail_tree_entry = e = xmalloc(n * sizeof(struct tree_entry)); while (n--) { *((void**)e) = e + 1; From a6a1a831d9bdcdc0adb9a23ce450db08779c2871 Mon Sep 17 00:00:00 2001 From: "Shawn O. Pearce" Date: Fri, 25 Aug 2006 16:03:04 -0400 Subject: [PATCH 22/81] Added option to export the marks table when fast-import terminates. The marks table can be used by the frontend to load any commit after the import and compare it to whatever data the frontend knows about that commit. 
If the mark idnums can be easily correlated to some reference source then it's relatively trivial to compare the GIT tree to the reference to verify the accuracy of the import. Signed-off-by: Shawn O. Pearce --- fast-import.c | 36 +++++++++++++++++++++++++++++++++++- 1 file changed, 35 insertions(+), 1 deletion(-) diff --git a/fast-import.c b/fast-import.c index 311db4e6d5..d61da3adec 100644 --- a/fast-import.c +++ b/fast-import.c @@ -223,6 +223,7 @@ static unsigned int object_entry_alloc = 1000; static struct object_entry_pool *blocks; static struct object_entry *object_table[1 << 16]; static struct mark_set *marks; +static const char* mark_file; /* Our last blob */ static struct last_object last_blob; @@ -1122,6 +1123,36 @@ static void dump_tags() } } +static void dump_marks_helper(FILE *f, + unsigned long base, + struct mark_set *m) +{ + int k; + if (m->shift) { + for (k = 0; k < 1024; k++) { + if (m->data.sets[k]) + dump_marks_helper(f, (base + k) << m->shift, + m->data.sets[k]); + } + } else { + for (k = 0; k < 1024; k++) { + if (m->data.marked[k]) + fprintf(f, "%lu,%s\n", base + k, + sha1_to_hex(m->data.marked[k]->sha1)); + } + } +} + +static void dump_marks() +{ + if (mark_file) + { + FILE *f = fopen(mark_file, "w"); + dump_marks_helper(f, 0, marks); + fclose(f); + } +} + static void read_next_command() { read_line(&command_buf, stdin, '\n'); @@ -1544,7 +1575,7 @@ static void cmd_new_tag() } static const char fast_import_usage[] = -"git-fast-import [--objects=n] [--depth=n] [--active-branches=n] temp.pack"; +"git-fast-import [--objects=n] [--depth=n] [--active-branches=n] [--export-marks=marks.file] temp.pack"; int main(int argc, const char **argv) { @@ -1569,6 +1600,8 @@ int main(int argc, const char **argv) max_depth = strtoul(a + 8, NULL, 0); else if (!strncmp(a, "--active-branches=", 18)) max_active_branches = strtoul(a + 18, NULL, 0); + else if (!strncmp(a, "--export-marks=", 15)) + mark_file = a + 15; else die("unknown option %s", a); } @@ -1613,6 
+1646,7 @@ int main(int argc, const char **argv) write_index(idx_name); dump_branches(); dump_tags(); + dump_marks(); fprintf(stderr, "%s statistics:\n", argv[0]); fprintf(stderr, "---------------------------------------------------\n"); From 264244a0429e23616a6065f6f52a15711981a8db Mon Sep 17 00:00:00 2001 From: "Shawn O. Pearce" Date: Fri, 25 Aug 2006 23:07:06 -0400 Subject: [PATCH 23/81] Added --branch-log option to fast-import. This option can be used to have a record of every commit, the mark (if supplied) and branch name of the commit recorded into a log file when the commit is generated. This log can be useful to verify the results of an import as the commits can be compared to some source repository matching commits through the mark value. Signed-off-by: Shawn O. Pearce --- fast-import.c | 42 +++++++++++++++++++++++++++++++++++----- 1 file changed, 37 insertions(+), 5 deletions(-) diff --git a/fast-import.c b/fast-import.c index d61da3adec..8328e004bb 100644 --- a/fast-import.c +++ b/fast-import.c @@ -248,6 +248,7 @@ static struct tag *last_tag; /* Input stream parsing */ static struct strbuf command_buf; static unsigned long next_mark; +static FILE* branch_log; static void alloc_objects(int cnt) @@ -1137,7 +1138,7 @@ static void dump_marks_helper(FILE *f, } else { for (k = 0; k < 1024; k++) { if (m->data.marked[k]) - fprintf(f, "%lu,%s\n", base + k, + fprintf(f, ":%lu %s\n", base + k, sha1_to_hex(m->data.marked[k]->sha1)); } } @@ -1476,6 +1477,18 @@ static void cmd_new_commit() store_object(OBJ_COMMIT, body, sp - body, NULL, b->sha1, next_mark); free(body); b->last_commit = object_count_by_type[OBJ_COMMIT]; + + if (branch_log) { + int need_dq = quote_c_style(b->name, NULL, NULL, 0); + fprintf(branch_log, "commit "); + if (need_dq) { + fputc('"', branch_log); + quote_c_style(b->name, NULL, branch_log, 0); + fputc('"', branch_log); + } else + fprintf(branch_log, "%s", b->name); + fprintf(branch_log," :%lu %s\n",next_mark,sha1_to_hex(b->sha1)); + } } 
static void cmd_new_tag() @@ -1490,6 +1503,7 @@ static void cmd_new_tag() size_t msglen; char *body; struct tag *t; + unsigned long from_mark = 0; unsigned char sha1[20]; /* Obtain the new tag name from the rest of our command */ @@ -1528,10 +1542,10 @@ static void cmd_new_tag() if (s) { memcpy(sha1, s->sha1, 20); } else if (*from == ':') { - unsigned long idnum = strtoul(from + 1, NULL, 10); - struct object_entry *oe = find_mark(idnum); + from_mark = strtoul(from + 1, NULL, 10); + struct object_entry *oe = find_mark(from_mark); if (oe->type != OBJ_COMMIT) - die("Mark :%lu not a commit", idnum); + die("Mark :%lu not a commit", from_mark); memcpy(sha1, oe->sha1, 20); } else if (!get_sha1(from, sha1)) { unsigned long size; @@ -1572,10 +1586,22 @@ static void cmd_new_tag() store_object(OBJ_TAG, body, sp - body, NULL, t->sha1, 0); free(body); + + if (branch_log) { + int need_dq = quote_c_style(t->name, NULL, NULL, 0); + fprintf(branch_log, "tag "); + if (need_dq) { + fputc('"', branch_log); + quote_c_style(t->name, NULL, branch_log, 0); + fputc('"', branch_log); + } else + fprintf(branch_log, "%s", t->name); + fprintf(branch_log," :%lu %s\n",from_mark,sha1_to_hex(t->sha1)); + } } static const char fast_import_usage[] = -"git-fast-import [--objects=n] [--depth=n] [--active-branches=n] [--export-marks=marks.file] temp.pack"; +"git-fast-import [--objects=n] [--depth=n] [--active-branches=n] [--export-marks=marks.file] [--branch-log=log] temp.pack"; int main(int argc, const char **argv) { @@ -1602,6 +1628,11 @@ int main(int argc, const char **argv) max_active_branches = strtoul(a + 18, NULL, 0); else if (!strncmp(a, "--export-marks=", 15)) mark_file = a + 15; + else if (!strncmp(a, "--branch-log=", 13)) { + branch_log = fopen(a + 13, "w"); + if (!branch_log) + die("Can't create %s: %s", a + 13, strerror(errno)); + } else die("unknown option %s", a); } @@ -1647,6 +1678,7 @@ int main(int argc, const char **argv) dump_branches(); dump_tags(); dump_marks(); + 
fclose(branch_log); fprintf(stderr, "%s statistics:\n", argv[0]); fprintf(stderr, "---------------------------------------------------\n"); From 2eb26d8454de77f45bbbfc32eed2a6c3133fe963 Mon Sep 17 00:00:00 2001 From: "Shawn O. Pearce" Date: Sat, 26 Aug 2006 22:38:02 -0400 Subject: [PATCH 24/81] Fixed GPF in fast-import caused by unterminated linked list. fast-import was encountering a GPF when it ran out of free tree_entry objects but didn't know this was the cause because the last tree_entry wasn't terminated with a NULL pointer. The missing NULL pointer occurred when we allocated additional entries via xmalloc but didn't set the last tree_entry's "next" pointer to NULL. Signed-off-by: Shawn O. Pearce --- fast-import.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fast-import.c b/fast-import.c index 8328e004bb..194116be6f 100644 --- a/fast-import.c +++ b/fast-import.c @@ -520,10 +520,11 @@ static struct tree_entry* new_tree_entry() unsigned int n = tree_entry_alloc; total_allocd += n * sizeof(struct tree_entry); avail_tree_entry = e = xmalloc(n * sizeof(struct tree_entry)); - while (n--) { + while (n-- > 1) { *((void**)e) = e + 1; e++; } + *((void*)e) = NULL; } e = avail_tree_entry; From 35ef237cf630418c2e45752eb527268693a2895b Mon Sep 17 00:00:00 2001 From: "Shawn O. Pearce" Date: Sat, 26 Aug 2006 23:37:31 -0400 Subject: [PATCH 25/81] Fixed compile error in fast-import. Signed-off-by: Shawn O. Pearce --- fast-import.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fast-import.c b/fast-import.c index 194116be6f..309f4d353b 100644 --- a/fast-import.c +++ b/fast-import.c @@ -524,7 +524,7 @@ static struct tree_entry* new_tree_entry() *((void**)e) = e + 1; e++; } - *((void*)e) = NULL; + *((void**)e) = NULL; } e = avail_tree_entry; From 53dbce78a2a018bd2828d3ecc4123015f88ae36f Mon Sep 17 00:00:00 2001 From: "Shawn O. 
Pearce" Date: Sun, 27 Aug 2006 05:53:48 -0400 Subject: [PATCH 26/81] Map only part of the generated pack file at any point in time. When generating a very large pack file (for example close to 1 GB in size) it may be impossible for the kernel to find a contiguous free range within a 32 bit address space for the mapping to be located at. This is especially problematic on large imports where there is a lot of malloc activity occurring within the same process and the malloc'd regions may straddle the previously mapped regions, thereby creating large holes in the address space. So instead we map only 128 MB of the pack at any given time. This will likely increase the number of times the file gets mapped (with additional system time required to update the page tables more frequently) but will allow the program to handle packs up to 4 GB in size. Signed-off-by: Shawn O. Pearce --- fast-import.c | 44 ++++++++++++++++++++++++-------------------- 1 file changed, 24 insertions(+), 20 deletions(-) diff --git a/fast-import.c b/fast-import.c index 309f4d353b..f3376c60ef 100644 --- a/fast-import.c +++ b/fast-import.c @@ -215,8 +215,10 @@ static struct atom_str **atom_table; static int pack_fd; static unsigned long pack_size; static unsigned char pack_sha1[20]; -static void* pack_base; -static size_t pack_mlen; +static unsigned char* pack_base; +static unsigned long pack_moff; +static unsigned long pack_mlen = 128*1024*1024; +static unsigned long page_size; /* Table of objects we've written. 
*/ static unsigned int object_entry_alloc = 1000; @@ -676,23 +678,26 @@ static int store_object( return 0; } -static void* map_pack(unsigned long offset) +static unsigned char* map_pack(unsigned long offset, unsigned int *left) { if (offset >= pack_size) die("object offset outside of pack file"); - if (offset >= pack_mlen) { + if (!pack_base + || offset < pack_moff + || (offset + 20) >= (pack_moff + pack_mlen)) { if (pack_base) munmap(pack_base, pack_mlen); - /* round out how much we map to 16 MB units */ - pack_mlen = pack_size; - if (pack_mlen & ((1 << 24) - 1)) - pack_mlen = ((pack_mlen >> 24) + 1) << 24; - pack_base = mmap(NULL,pack_mlen,PROT_READ,MAP_SHARED,pack_fd,0); + pack_moff = (offset / page_size) * page_size; + pack_base = mmap(NULL,pack_mlen,PROT_READ,MAP_SHARED, + pack_fd,pack_moff); if (pack_base == MAP_FAILED) die("Failed to map generated pack: %s", strerror(errno)); remap_count++; } - return (char*)pack_base + offset; + offset -= pack_moff; + if (left) + *left = pack_mlen - offset; + return pack_base + offset; } static unsigned long unpack_object_header(unsigned long offset, @@ -703,12 +708,12 @@ static unsigned long unpack_object_header(unsigned long offset, unsigned char c; unsigned long size; - c = *(unsigned char*)map_pack(offset++); + c = *map_pack(offset++, NULL); *type = (c >> 4) & 7; size = c & 15; shift = 4; while (c & 0x80) { - c = *(unsigned char*)map_pack(offset++); + c = *map_pack(offset++, NULL); size += (c & 0x7f) << shift; shift += 7; } @@ -725,8 +730,7 @@ static void *unpack_non_delta_entry(unsigned long o, unsigned long sz) result[sz] = 0; memset(&stream, 0, sizeof(stream)); - stream.next_in = map_pack(o); - stream.avail_in = pack_mlen - o; + stream.next_in = map_pack(o, &stream.avail_in); stream.next_out = result; stream.avail_out = sz; @@ -735,13 +739,12 @@ static void *unpack_non_delta_entry(unsigned long o, unsigned long sz) int st = inflate(&stream, Z_FINISH); if (st == Z_STREAM_END) break; - if (st == Z_OK) { - o = 
stream.next_in - (unsigned char*)pack_base; - stream.next_in = map_pack(o); - stream.avail_in = pack_mlen - o; + if (st == Z_OK || st == Z_BUF_ERROR) { + o = stream.next_in - pack_base + pack_moff; + stream.next_in = map_pack(o, &stream.avail_in); continue; } - die("Error from zlib during inflate."); + die("Error %i from zlib during inflate.", st); } inflateEnd(&stream); if (stream.total_out != sz) @@ -760,7 +763,7 @@ static void *unpack_delta_entry(unsigned long offset, void *delta_data, *base, *result; unsigned long base_size, result_size; - base_sha1 = (unsigned char*)map_pack(offset + 20) - 20; + base_sha1 = map_pack(offset, NULL); base_oe = find_object(base_sha1); if (!base_oe) die("I'm broken; I can't find a base I know must be here."); @@ -1615,6 +1618,7 @@ int main(int argc, const char **argv) setup_ident(); git_config(git_default_config); + page_size = getpagesize(); for (i = 1; i < argc; i++) { const char *a = argv[i]; From 5fced8dc6f4844997b6e25a67a00f428775c5233 Mon Sep 17 00:00:00 2001 From: "Shawn O. Pearce" Date: Sun, 27 Aug 2006 06:20:49 -0400 Subject: [PATCH 27/81] Added 'reset' command to clear a branch's tree. Sometimes an import frontend may need to work with a temporary branch which will actually contain many different branches over the life of the import. This is especially useful when the frontend needs to create a tag from a set of file versions which are otherwise never a commit. Signed-off-by: Shawn O. 
Pearce --- fast-import.c | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/fast-import.c b/fast-import.c index f3376c60ef..778b8bfdd4 100644 --- a/fast-import.c +++ b/fast-import.c @@ -6,6 +6,7 @@ Format of STDIN stream: cmd ::= new_blob | new_commit | new_tag + | reset_branch ; new_blob ::= 'blob' lf @@ -34,6 +35,8 @@ Format of STDIN stream: tag_msg; tag_msg ::= data; + reset_branch ::= 'reset' sp ref_str lf; + # note: the first idnum in a stream should be 1 and subsequent # idnums should not have gaps between values as this will cause # the stream parser to reserve space for the gapped values. An @@ -1604,6 +1607,33 @@ static void cmd_new_tag() } } +static void cmd_reset_branch() +{ + struct branch *b; + char *str_uq; + const char *endp; + char *sp; + + /* Obtain the branch name from the rest of our command */ + sp = strchr(command_buf.buf, ' ') + 1; + str_uq = unquote_c_style(sp, &endp); + if (str_uq) { + if (*endp) + die("Garbage after ref in: %s", command_buf.buf); + sp = str_uq; + } + b = lookup_branch(sp); + if (b) { + b->last_commit = 0; + if (b->branch_tree.tree) { + release_tree_content_recursive(b->branch_tree.tree); + b->branch_tree.tree = NULL; + } + } + if (str_uq) + free(str_uq); +} + static const char fast_import_usage[] = "git-fast-import [--objects=n] [--depth=n] [--active-branches=n] [--export-marks=marks.file] [--branch-log=log] temp.pack"; @@ -1673,6 +1703,8 @@ int main(int argc, const char **argv) cmd_new_commit(); else if (!strncmp("tag ", command_buf.buf, 4)) cmd_new_tag(); + else if (!strncmp("reset ", command_buf.buf, 6)) + cmd_reset_branch(); else die("Unsupported command: %s", command_buf.buf); } From 08d7e892a714dec8471cd45add2b1da24f66b3e7 Mon Sep 17 00:00:00 2001 From: "Shawn O. Pearce" Date: Sun, 27 Aug 2006 20:13:44 -0400 Subject: [PATCH 28/81] Don't crash fast-import if no branch log was requested. Signed-off-by: Shawn O. 
Pearce --- fast-import.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fast-import.c b/fast-import.c index 778b8bfdd4..5376b5e15c 100644 --- a/fast-import.c +++ b/fast-import.c @@ -1715,7 +1715,8 @@ int main(int argc, const char **argv) dump_branches(); dump_tags(); dump_marks(); - fclose(branch_log); + if (branch_log) + fclose(branch_log); fprintf(stderr, "%s statistics:\n", argv[0]); fprintf(stderr, "---------------------------------------------------\n"); From 445b85999a309c8e5c7f928484c57325c280152e Mon Sep 17 00:00:00 2001 From: "Shawn O. Pearce" Date: Mon, 28 Aug 2006 10:46:58 -0400 Subject: [PATCH 29/81] Converted hash memcpy/memcmp to new hashcpy/hashcmp/hashclr. Signed-off-by: Shawn O. Pearce --- fast-import.c | 52 +++++++++++++++++++++++++-------------------------- 1 file changed, 26 insertions(+), 26 deletions(-) diff --git a/fast-import.c b/fast-import.c index 5376b5e15c..b1b2382560 100644 --- a/fast-import.c +++ b/fast-import.c @@ -277,7 +277,7 @@ static struct object_entry* new_object(unsigned char *sha1) alloc_objects(object_entry_alloc); e = blocks->next_free++; - memcpy(e->sha1, sha1, sizeof(e->sha1)); + hashcpy(e->sha1, sha1); return e; } @@ -286,7 +286,7 @@ static struct object_entry* find_object(unsigned char *sha1) unsigned int h = sha1[0] << 8 | sha1[1]; struct object_entry *e; for (e = object_table[h]; e; e = e->next) - if (!memcmp(sha1, e->sha1, sizeof(e->sha1))) + if (!hashcmp(sha1, e->sha1)) return e; return NULL; } @@ -298,7 +298,7 @@ static struct object_entry* insert_object(unsigned char *sha1) struct object_entry *p = NULL; while (e) { - if (!memcmp(sha1, e->sha1, sizeof(e->sha1))) + if (!hashcmp(sha1, e->sha1)) return e; p = e; e = e->next; @@ -616,7 +616,7 @@ static int store_object( SHA1_Update(&c, dat, datlen); SHA1_Final(sha1, &c); if (sha1out) - memcpy(sha1out, sha1, sizeof(sha1)); + hashcpy(sha1out, sha1); e = insert_object(sha1); if (mark) @@ -676,7 +676,7 @@ static int store_object( free(last->data); 
last->data = dat; last->len = datlen; - memcpy(last->sha1, sha1, sizeof(sha1)); + hashcpy(last->sha1, sha1); } return 0; } @@ -826,7 +826,7 @@ static void load_tree(struct tree_entry *root) const char *c; root->tree = t = new_tree_content(8); - if (!memcmp(root->sha1, null_sha1, 20)) + if (is_null_sha1(root->sha1)) return; myoe = find_object(root->sha1); @@ -855,7 +855,7 @@ static void load_tree(struct tree_entry *root) die("Corrupt mode in %s", sha1_to_hex(root->sha1)); e->name = to_atom(c, strlen(c)); c += e->name->str_len + 1; - memcpy(e->sha1, c, sizeof(e->sha1)); + hashcpy(e->sha1, c); c += 20; } free(buf); @@ -877,7 +877,7 @@ static void store_tree(struct tree_entry *root) size_t maxlen; char *buf, *c; - if (memcmp(root->sha1, null_sha1, 20)) + if (!is_null_sha1(root->sha1)) return; maxlen = 0; @@ -895,7 +895,7 @@ static void store_tree(struct tree_entry *root) *c++ = ' '; strcpy(c, e->name->str_dat); c += e->name->str_len + 1; - memcpy(c, e->sha1, 20); + hashcpy(c, e->sha1); c += 20; } store_object(OBJ_TREE, buf, c - buf, NULL, root->sha1, 0); @@ -923,15 +923,15 @@ static int tree_content_set( e = t->entries[i]; if (e->name->str_len == n && !strncmp(p, e->name->str_dat, n)) { if (!slash1) { - if (e->mode == mode && !memcmp(e->sha1, sha1, 20)) + if (e->mode == mode && !hashcmp(e->sha1, sha1)) return 0; e->mode = mode; - memcpy(e->sha1, sha1, 20); + hashcpy(e->sha1, sha1); if (e->tree) { release_tree_content_recursive(e->tree); e->tree = NULL; } - memcpy(root->sha1, null_sha1, 20); + hashclr(root->sha1); return 1; } if (!S_ISDIR(e->mode)) { @@ -941,7 +941,7 @@ static int tree_content_set( if (!e->tree) load_tree(e); if (tree_content_set(e, slash1 + 1, sha1, mode)) { - memcpy(root->sha1, null_sha1, 20); + hashclr(root->sha1); return 1; } return 0; @@ -960,9 +960,9 @@ static int tree_content_set( } else { e->tree = NULL; e->mode = mode; - memcpy(e->sha1, sha1, 20); + hashcpy(e->sha1, sha1); } - memcpy(root->sha1, null_sha1, 20); + hashclr(root->sha1); return 1; 
} @@ -989,7 +989,7 @@ static int tree_content_remove(struct tree_entry *root, const char *p) if (tree_content_remove(e, slash1 + 1)) { if (!e->tree->entry_count) goto del_entry; - memcpy(root->sha1, null_sha1, 20); + hashclr(root->sha1); return 1; } return 0; @@ -1002,7 +1002,7 @@ del_entry: t->entries[i-1] = t->entries[i]; t->entry_count--; release_tree_entry(e); - memcpy(root->sha1, null_sha1, 20); + hashclr(root->sha1); return 1; } @@ -1054,7 +1054,7 @@ static int oecmp (const void *_a, const void *_b) { struct object_entry *a = *((struct object_entry**)_a); struct object_entry *b = *((struct object_entry**)_b); - return memcmp(a->sha1, b->sha1, sizeof(a->sha1)); + return hashcmp(a->sha1, b->sha1); } static void write_index(const char *idx_name) @@ -1359,8 +1359,8 @@ static void cmd_from(struct branch *b) if (b == s) die("Can't create a branch from itself: %s", b->name); else if (s) { - memcpy(b->sha1, s->sha1, 20); - memcpy(b->branch_tree.sha1, s->branch_tree.sha1, 20); + hashcpy(b->sha1, s->sha1); + hashcpy(b->branch_tree.sha1, s->branch_tree.sha1); } else if (*from == ':') { unsigned long idnum = strtoul(from + 1, NULL, 10); struct object_entry *oe = find_mark(idnum); @@ -1368,7 +1368,7 @@ static void cmd_from(struct branch *b) char *buf; if (oe->type != OBJ_COMMIT) die("Mark :%lu not a commit", idnum); - memcpy(b->sha1, oe->sha1, 20); + hashcpy(b->sha1, oe->sha1); buf = unpack_entry(oe->offset, &size); if (!buf || size < 46) die("Not a valid commit: %s", from); @@ -1377,8 +1377,8 @@ static void cmd_from(struct branch *b) die("The commit %s is corrupt", sha1_to_hex(b->sha1)); free(buf); } else if (!get_sha1(from, b->sha1)) { - if (!memcmp(b->sha1, null_sha1, 20)) - memcpy(b->branch_tree.sha1, null_sha1, 20); + if (is_null_sha1(b->sha1)) + hashclr(b->branch_tree.sha1); else { unsigned long size; char *buf; @@ -1467,7 +1467,7 @@ static void cmd_new_commit() : 2 * strlen(committer))); sp = body; sp += sprintf(sp, "tree %s\n", sha1_to_hex(b->branch_tree.sha1)); - 
if (memcmp(b->sha1, null_sha1, 20)) + if (!is_null_sha1(b->sha1)) sp += sprintf(sp, "parent %s\n", sha1_to_hex(b->sha1)); if (author) sp += sprintf(sp, "%s\n", author); @@ -1547,13 +1547,13 @@ static void cmd_new_tag() s = lookup_branch(from); if (s) { - memcpy(sha1, s->sha1, 20); + hashcpy(sha1, s->sha1); } else if (*from == ':') { from_mark = strtoul(from + 1, NULL, 10); struct object_entry *oe = find_mark(from_mark); if (oe->type != OBJ_COMMIT) die("Mark :%lu not a commit", from_mark); - memcpy(sha1, oe->sha1, 20); + hashcpy(sha1, oe->sha1); } else if (!get_sha1(from, sha1)) { unsigned long size; char *buf; From 4cabf8583f934260697a065186f3dce135834ede Mon Sep 17 00:00:00 2001 From: "Shawn O. Pearce" Date: Mon, 28 Aug 2006 12:22:50 -0400 Subject: [PATCH 30/81] Implemented tree delta compression in fast-import. We now store for every tree entry two modes and two sha1 values; the base (aka "version 0") and the current/new (aka "version 1"). When we generate a tree object we also regenerate the prior version object and use that as our base object for a delta. This strategy saves a significant amount of memory as we can continue to use the atom pool for file/directory names and only increases each tree entry by an additional 24 bytes of memory. Branches should automatically delta against their ancestor tree, unless the ancestor tree is already at the delta chain limit. Signed-off-by: Shawn O. 
Pearce --- fast-import.c | 228 +++++++++++++++++++++++++++++++++++--------------- 1 file changed, 159 insertions(+), 69 deletions(-) diff --git a/fast-import.c b/fast-import.c index b1b2382560..6b01120415 100644 --- a/fast-import.c +++ b/fast-import.c @@ -132,7 +132,7 @@ struct mark_set struct last_object { void *data; - unsigned int len; + unsigned long len; unsigned int depth; unsigned char sha1[20]; }; @@ -157,14 +157,18 @@ struct tree_entry { struct tree_content *tree; struct atom_str* name; - unsigned int mode; - unsigned char sha1[20]; + struct tree_entry_ms + { + unsigned int mode; + unsigned char sha1[20]; + } versions[2]; }; struct tree_content { unsigned int entry_capacity; /* must match avail_tree_content */ unsigned int entry_count; + unsigned int delta_depth; struct tree_entry *entries[FLEX_ARRAY]; /* more */ }; @@ -203,6 +207,7 @@ static unsigned long duplicate_count; static unsigned long marks_set_count; static unsigned long object_count_by_type[9]; static unsigned long duplicate_count_by_type[9]; +static unsigned long delta_count_by_type[9]; /* Memory pools */ static size_t mem_pool_alloc = 2*1024*1024 - sizeof(struct mem_pool); @@ -224,7 +229,7 @@ static unsigned long pack_mlen = 128*1024*1024; static unsigned long page_size; /* Table of objects we've written. 
*/ -static unsigned int object_entry_alloc = 1000; +static unsigned int object_entry_alloc = 5000; static struct object_entry_pool *blocks; static struct object_entry *object_table[1 << 16]; static struct mark_set *marks; @@ -486,6 +491,7 @@ static struct tree_content* new_tree_content(unsigned int cnt) t = (struct tree_content*)f; t->entry_count = 0; + t->delta_depth = 0; return t; } @@ -512,6 +518,7 @@ static struct tree_content* grow_tree_content( { struct tree_content *r = new_tree_content(t->entry_count + amt); r->entry_count = t->entry_count; + r->delta_depth = t->delta_depth; memcpy(r->entries,t->entries,t->entry_count*sizeof(t->entries[0])); release_tree_content(t); return r; @@ -642,6 +649,7 @@ static int store_object( deflateInit(&s, zlib_compression_level); if (delta) { + delta_count_by_type[type]++; last->depth++; s.next_in = delta; s.avail_in = deltalen; @@ -755,11 +763,14 @@ static void *unpack_non_delta_entry(unsigned long o, unsigned long sz) return result; } -static void *unpack_entry(unsigned long offset, unsigned long *sizep); +static void *unpack_entry(unsigned long offset, + unsigned long *sizep, + unsigned int *delta_depth); static void *unpack_delta_entry(unsigned long offset, unsigned long delta_size, - unsigned long *sizep) + unsigned long *sizep, + unsigned int *delta_depth) { struct object_entry *base_oe; unsigned char *base_sha1; @@ -770,7 +781,7 @@ static void *unpack_delta_entry(unsigned long offset, base_oe = find_object(base_sha1); if (!base_oe) die("I'm broken; I can't find a base I know must be here."); - base = unpack_entry(base_oe->offset, &base_size); + base = unpack_entry(base_oe->offset, &base_size, delta_depth); delta_data = unpack_non_delta_entry(offset + 20, delta_size); result = patch_delta(base, base_size, delta_data, delta_size, @@ -780,10 +791,13 @@ static void *unpack_delta_entry(unsigned long offset, free(delta_data); free(base); *sizep = result_size; + (*delta_depth)++; return result; } -static void 
*unpack_entry(unsigned long offset, unsigned long *sizep) +static void *unpack_entry(unsigned long offset, + unsigned long *sizep, + unsigned int *delta_depth) { unsigned long size; enum object_type kind; @@ -791,12 +805,13 @@ static void *unpack_entry(unsigned long offset, unsigned long *sizep) offset = unpack_object_header(offset, &kind, &size); switch (kind) { case OBJ_DELTA: - return unpack_delta_entry(offset, size, sizep); + return unpack_delta_entry(offset, size, sizep, delta_depth); case OBJ_COMMIT: case OBJ_TREE: case OBJ_BLOB: case OBJ_TAG: *sizep = size; + *delta_depth = 0; return unpack_non_delta_entry(offset, size); default: die("I created an object I can't read!"); @@ -819,6 +834,7 @@ static const char *get_mode(const char *str, unsigned int *modep) static void load_tree(struct tree_entry *root) { + unsigned char* sha1 = root->versions[1].sha1; struct object_entry *myoe; struct tree_content *t; unsigned long size; @@ -826,19 +842,19 @@ static void load_tree(struct tree_entry *root) const char *c; root->tree = t = new_tree_content(8); - if (is_null_sha1(root->sha1)) + if (is_null_sha1(sha1)) return; - myoe = find_object(root->sha1); + myoe = find_object(sha1); if (myoe) { if (myoe->type != OBJ_TREE) - die("Not a tree: %s", sha1_to_hex(root->sha1)); - buf = unpack_entry(myoe->offset, &size); + die("Not a tree: %s", sha1_to_hex(sha1)); + buf = unpack_entry(myoe->offset, &size, &t->delta_depth); } else { char type[20]; - buf = read_sha1_file(root->sha1, type, &size); + buf = read_sha1_file(sha1, type, &size); if (!buf || strcmp(type, tree_type)) - die("Can't load tree %s", sha1_to_hex(root->sha1)); + die("Can't load tree %s", sha1_to_hex(sha1)); } c = buf; @@ -850,56 +866,116 @@ static void load_tree(struct tree_entry *root) t->entries[t->entry_count++] = e; e->tree = NULL; - c = get_mode(c, &e->mode); + c = get_mode(c, &e->versions[1].mode); if (!c) - die("Corrupt mode in %s", sha1_to_hex(root->sha1)); + die("Corrupt mode in %s", sha1_to_hex(sha1)); + 
e->versions[0].mode = e->versions[1].mode; e->name = to_atom(c, strlen(c)); c += e->name->str_len + 1; - hashcpy(e->sha1, c); + hashcpy(e->versions[0].sha1, (unsigned char*)c); + hashcpy(e->versions[1].sha1, (unsigned char*)c); c += 20; } free(buf); } -static int tecmp (const void *_a, const void *_b) +static int tecmp0 (const void *_a, const void *_b) { struct tree_entry *a = *((struct tree_entry**)_a); struct tree_entry *b = *((struct tree_entry**)_b); return base_name_compare( - a->name->str_dat, a->name->str_len, a->mode, - b->name->str_dat, b->name->str_len, b->mode); + a->name->str_dat, a->name->str_len, a->versions[0].mode, + b->name->str_dat, b->name->str_len, b->versions[0].mode); +} + +static int tecmp1 (const void *_a, const void *_b) +{ + struct tree_entry *a = *((struct tree_entry**)_a); + struct tree_entry *b = *((struct tree_entry**)_b); + return base_name_compare( + a->name->str_dat, a->name->str_len, a->versions[1].mode, + b->name->str_dat, b->name->str_len, b->versions[1].mode); +} + +static void* mktree(struct tree_content *t, int v, unsigned long *szp) +{ + size_t maxlen = 0; + unsigned int i; + char *buf, *c; + + if (!v) + qsort(t->entries,t->entry_count,sizeof(t->entries[0]),tecmp0); + else + qsort(t->entries,t->entry_count,sizeof(t->entries[0]),tecmp1); + + for (i = 0; i < t->entry_count; i++) { + if (t->entries[i]->versions[v].mode) + maxlen += t->entries[i]->name->str_len + 34; + } + + buf = c = xmalloc(maxlen); + for (i = 0; i < t->entry_count; i++) { + struct tree_entry *e = t->entries[i]; + if (!e->versions[v].mode) + continue; + c += sprintf(c, "%o", e->versions[v].mode); + *c++ = ' '; + strcpy(c, e->name->str_dat); + c += e->name->str_len + 1; + hashcpy((unsigned char*)c, e->versions[v].sha1); + c += 20; + } + + *szp = c - buf; + return buf; } static void store_tree(struct tree_entry *root) { struct tree_content *t = root->tree; - unsigned int i; - size_t maxlen; - char *buf, *c; + unsigned int i, j, del; + unsigned long vers1len; + 
void **vers1dat; + struct last_object lo; - if (!is_null_sha1(root->sha1)) + if (!is_null_sha1(root->versions[1].sha1)) return; - maxlen = 0; for (i = 0; i < t->entry_count; i++) { - maxlen += t->entries[i]->name->str_len + 34; if (t->entries[i]->tree) store_tree(t->entries[i]); } - qsort(t->entries, t->entry_count, sizeof(t->entries[0]), tecmp); - buf = c = xmalloc(maxlen); - for (i = 0; i < t->entry_count; i++) { - struct tree_entry *e = t->entries[i]; - c += sprintf(c, "%o", e->mode); - *c++ = ' '; - strcpy(c, e->name->str_dat); - c += e->name->str_len + 1; - hashcpy(c, e->sha1); - c += 20; + if (is_null_sha1(root->versions[0].sha1) + || !find_object(root->versions[0].sha1)) { + lo.data = NULL; + lo.depth = 0; + } else { + lo.data = mktree(t, 0, &lo.len); + lo.depth = t->delta_depth; + hashcpy(lo.sha1, root->versions[0].sha1); } - store_object(OBJ_TREE, buf, c - buf, NULL, root->sha1, 0); - free(buf); + vers1dat = mktree(t, 1, &vers1len); + + store_object(OBJ_TREE, vers1dat, vers1len, + &lo, root->versions[1].sha1, 0); + /* note: lo.dat (if created) was freed by store_object */ + free(vers1dat); + + t->delta_depth = lo.depth; + hashcpy(root->versions[0].sha1, root->versions[1].sha1); + for (i = 0, j = 0, del = 0; i < t->entry_count; i++) { + struct tree_entry *e = t->entries[i]; + if (e->versions[1].mode) { + e->versions[0].mode = e->versions[1].mode; + hashcpy(e->versions[0].sha1, e->versions[1].sha1); + t->entries[j++] = e; + } else { + release_tree_entry(e); + del++; + } + } + t->entry_count -= del; } static int tree_content_set( @@ -923,25 +999,26 @@ static int tree_content_set( e = t->entries[i]; if (e->name->str_len == n && !strncmp(p, e->name->str_dat, n)) { if (!slash1) { - if (e->mode == mode && !hashcmp(e->sha1, sha1)) + if (e->versions[1].mode == mode + && !hashcmp(e->versions[1].sha1, sha1)) return 0; - e->mode = mode; - hashcpy(e->sha1, sha1); + e->versions[1].mode = mode; + hashcpy(e->versions[1].sha1, sha1); if (e->tree) { 
release_tree_content_recursive(e->tree); e->tree = NULL; } - hashclr(root->sha1); + hashclr(root->versions[1].sha1); return 1; } - if (!S_ISDIR(e->mode)) { + if (!S_ISDIR(e->versions[1].mode)) { e->tree = new_tree_content(8); - e->mode = S_IFDIR; + e->versions[1].mode = S_IFDIR; } if (!e->tree) load_tree(e); if (tree_content_set(e, slash1 + 1, sha1, mode)) { - hashclr(root->sha1); + hashclr(root->versions[1].sha1); return 1; } return 0; @@ -952,17 +1029,19 @@ static int tree_content_set( root->tree = t = grow_tree_content(t, 8); e = new_tree_entry(); e->name = to_atom(p, n); + e->versions[0].mode = 0; + hashclr(e->versions[0].sha1); t->entries[t->entry_count++] = e; if (slash1) { e->tree = new_tree_content(8); - e->mode = S_IFDIR; + e->versions[1].mode = S_IFDIR; tree_content_set(e, slash1 + 1, sha1, mode); } else { e->tree = NULL; - e->mode = mode; - hashcpy(e->sha1, sha1); + e->versions[1].mode = mode; + hashcpy(e->versions[1].sha1, sha1); } - hashclr(root->sha1); + hashclr(root->versions[1].sha1); return 1; } @@ -982,14 +1061,14 @@ static int tree_content_remove(struct tree_entry *root, const char *p) for (i = 0; i < t->entry_count; i++) { e = t->entries[i]; if (e->name->str_len == n && !strncmp(p, e->name->str_dat, n)) { - if (!slash1 || !S_ISDIR(e->mode)) + if (!slash1 || !S_ISDIR(e->versions[1].mode)) goto del_entry; if (!e->tree) load_tree(e); if (tree_content_remove(e, slash1 + 1)) { if (!e->tree->entry_count) goto del_entry; - hashclr(root->sha1); + hashclr(root->versions[1].sha1); return 1; } return 0; @@ -998,11 +1077,13 @@ static int tree_content_remove(struct tree_entry *root, const char *p) return 0; del_entry: - for (i++; i < t->entry_count; i++) - t->entries[i-1] = t->entries[i]; - t->entry_count--; - release_tree_entry(e); - hashclr(root->sha1); + if (e->tree) { + release_tree_content_recursive(e->tree); + e->tree = NULL; + } + e->versions[1].mode = 0; + hashclr(e->versions[1].sha1); + hashclr(root->versions[1].sha1); return 1; } @@ -1359,27 
+1440,33 @@ static void cmd_from(struct branch *b) if (b == s) die("Can't create a branch from itself: %s", b->name); else if (s) { + unsigned char *t = s->branch_tree.versions[1].sha1; hashcpy(b->sha1, s->sha1); - hashcpy(b->branch_tree.sha1, s->branch_tree.sha1); + hashcpy(b->branch_tree.versions[0].sha1, t); + hashcpy(b->branch_tree.versions[1].sha1, t); } else if (*from == ':') { unsigned long idnum = strtoul(from + 1, NULL, 10); struct object_entry *oe = find_mark(idnum); unsigned long size; + unsigned int depth; char *buf; if (oe->type != OBJ_COMMIT) die("Mark :%lu not a commit", idnum); hashcpy(b->sha1, oe->sha1); - buf = unpack_entry(oe->offset, &size); + buf = unpack_entry(oe->offset, &size, &depth); if (!buf || size < 46) die("Not a valid commit: %s", from); if (memcmp("tree ", buf, 5) - || get_sha1_hex(buf + 5, b->branch_tree.sha1)) + || get_sha1_hex(buf + 5, b->branch_tree.versions[1].sha1)) die("The commit %s is corrupt", sha1_to_hex(b->sha1)); free(buf); + hashcpy(b->branch_tree.versions[0].sha1, + b->branch_tree.versions[1].sha1); } else if (!get_sha1(from, b->sha1)) { - if (is_null_sha1(b->sha1)) - hashclr(b->branch_tree.sha1); - else { + if (is_null_sha1(b->sha1)) { + hashclr(b->branch_tree.versions[0].sha1); + hashclr(b->branch_tree.versions[1].sha1); + } else { unsigned long size; char *buf; @@ -1388,9 +1475,11 @@ static void cmd_from(struct branch *b) if (!buf || size < 46) die("Not a valid commit: %s", from); if (memcmp("tree ", buf, 5) - || get_sha1_hex(buf + 5, b->branch_tree.sha1)) + || get_sha1_hex(buf + 5, b->branch_tree.versions[1].sha1)) die("The commit %s is corrupt", sha1_to_hex(b->sha1)); free(buf); + hashcpy(b->branch_tree.versions[0].sha1, + b->branch_tree.versions[1].sha1); } } else die("Invalid ref name or SHA1 expression: %s", from); @@ -1466,7 +1555,8 @@ static void cmd_new_commit() ? 
strlen(author) + strlen(committer) : 2 * strlen(committer))); sp = body; - sp += sprintf(sp, "tree %s\n", sha1_to_hex(b->branch_tree.sha1)); + sp += sprintf(sp, "tree %s\n", + sha1_to_hex(b->branch_tree.versions[1].sha1)); if (!is_null_sha1(b->sha1)) sp += sprintf(sp, "parent %s\n", sha1_to_hex(b->sha1)); if (author) @@ -1722,10 +1812,10 @@ int main(int argc, const char **argv) fprintf(stderr, "---------------------------------------------------\n"); fprintf(stderr, "Alloc'd objects: %10lu (%10lu overflow )\n", alloc_count, alloc_count - est_obj_cnt); fprintf(stderr, "Total objects: %10lu (%10lu duplicates)\n", object_count, duplicate_count); - fprintf(stderr, " blobs : %10lu (%10lu duplicates)\n", object_count_by_type[OBJ_BLOB], duplicate_count_by_type[OBJ_BLOB]); - fprintf(stderr, " trees : %10lu (%10lu duplicates)\n", object_count_by_type[OBJ_TREE], duplicate_count_by_type[OBJ_TREE]); - fprintf(stderr, " commits: %10lu (%10lu duplicates)\n", object_count_by_type[OBJ_COMMIT], duplicate_count_by_type[OBJ_COMMIT]); - fprintf(stderr, " tags : %10lu (%10lu duplicates)\n", object_count_by_type[OBJ_TAG], duplicate_count_by_type[OBJ_TAG]); + fprintf(stderr, " blobs : %10lu (%10lu duplicates %10lu deltas)\n", object_count_by_type[OBJ_BLOB], duplicate_count_by_type[OBJ_BLOB], delta_count_by_type[OBJ_BLOB]); + fprintf(stderr, " trees : %10lu (%10lu duplicates %10lu deltas)\n", object_count_by_type[OBJ_TREE], duplicate_count_by_type[OBJ_TREE], delta_count_by_type[OBJ_TREE]); + fprintf(stderr, " commits: %10lu (%10lu duplicates %10lu deltas)\n", object_count_by_type[OBJ_COMMIT], duplicate_count_by_type[OBJ_COMMIT], delta_count_by_type[OBJ_COMMIT]); + fprintf(stderr, " tags : %10lu (%10lu duplicates %10lu deltas)\n", object_count_by_type[OBJ_TAG], duplicate_count_by_type[OBJ_TAG], delta_count_by_type[OBJ_TAG]); fprintf(stderr, "Total branches: %10lu (%10lu loads )\n", branch_count, branch_load_count); fprintf(stderr, " marks: %10u (%10lu unique )\n", (1 << marks->shift) * 
1024, marks_set_count); fprintf(stderr, " atoms: %10u\n", atom_cnt); From e2eb469d1ff9595882c8329ad415b1d7246769d0 Mon Sep 17 00:00:00 2001 From: "Shawn O. Pearce" Date: Mon, 28 Aug 2006 13:02:51 -0400 Subject: [PATCH 31/81] Recycle data buffers for tree generation in fast-import. We only ever generate at most two tree streams at a time. Since most trees are around the same size we can simply recycle the buffers from one tree generation to the next rather than constantly xmalloc'ing and free'ing them. This should perform slightly better when handling a large number of trees as malloc has less work to do. Signed-off-by: Shawn O. Pearce --- fast-import.c | 46 ++++++++++++++++++++++++++++++++-------------- 1 file changed, 32 insertions(+), 14 deletions(-) diff --git a/fast-import.c b/fast-import.c index 6b01120415..8d15a05739 100644 --- a/fast-import.c +++ b/fast-import.c @@ -134,6 +134,7 @@ struct last_object void *data; unsigned long len; unsigned int depth; + int no_free; unsigned char sha1[20]; }; @@ -195,6 +196,12 @@ struct tag unsigned char sha1[20]; }; +struct dbuf +{ + void *buffer; + size_t capacity; +}; + /* Stats and misc. 
counters */ static unsigned long max_depth = 10; @@ -243,6 +250,8 @@ static unsigned int tree_entry_alloc = 1000; static void *avail_tree_entry; static unsigned int avail_tree_table_sz = 100; static struct avail_tree_content **avail_tree_table; +static struct dbuf old_tree; +static struct dbuf new_tree; /* Branch data */ static unsigned long max_active_branches = 5; @@ -680,7 +689,7 @@ static int store_object( if (delta) free(delta); if (last) { - if (last->data) + if (last->data && !last->no_free) free(last->data); last->data = dat; last->len = datlen; @@ -897,11 +906,14 @@ static int tecmp1 (const void *_a, const void *_b) b->name->str_dat, b->name->str_len, b->versions[1].mode); } -static void* mktree(struct tree_content *t, int v, unsigned long *szp) +static void mktree(struct tree_content *t, + int v, + unsigned long *szp, + struct dbuf *b) { size_t maxlen = 0; unsigned int i; - char *buf, *c; + char *c; if (!v) qsort(t->entries,t->entry_count,sizeof(t->entries[0]),tecmp0); @@ -913,7 +925,16 @@ static void* mktree(struct tree_content *t, int v, unsigned long *szp) maxlen += t->entries[i]->name->str_len + 34; } - buf = c = xmalloc(maxlen); + if (b->buffer) { + if (b->capacity < maxlen) + b->capacity = ((maxlen / 1024) + 1) * 1024; + b->buffer = xrealloc(b->buffer, b->capacity); + } else { + b->capacity = ((maxlen / 1024) + 1) * 1024; + b->buffer = xmalloc(b->capacity); + } + + c = b->buffer; for (i = 0; i < t->entry_count; i++) { struct tree_entry *e = t->entries[i]; if (!e->versions[v].mode) @@ -925,17 +946,14 @@ static void* mktree(struct tree_content *t, int v, unsigned long *szp) hashcpy((unsigned char*)c, e->versions[v].sha1); c += 20; } - - *szp = c - buf; - return buf; + *szp = c - (char*)b->buffer; } static void store_tree(struct tree_entry *root) { struct tree_content *t = root->tree; unsigned int i, j, del; - unsigned long vers1len; - void **vers1dat; + unsigned long new_len; struct last_object lo; if (!is_null_sha1(root->versions[1].sha1)) @@ -951,16 
+969,16 @@ static void store_tree(struct tree_entry *root) lo.data = NULL; lo.depth = 0; } else { - lo.data = mktree(t, 0, &lo.len); + mktree(t, 0, &lo.len, &old_tree); + lo.data = old_tree.buffer; lo.depth = t->delta_depth; + lo.no_free = 1; hashcpy(lo.sha1, root->versions[0].sha1); } - vers1dat = mktree(t, 1, &vers1len); + mktree(t, 1, &new_len, &new_tree); - store_object(OBJ_TREE, vers1dat, vers1len, + store_object(OBJ_TREE, new_tree.buffer, new_len, &lo, root->versions[1].sha1, 0); - /* note: lo.dat (if created) was freed by store_object */ - free(vers1dat); t->delta_depth = lo.depth; hashcpy(root->versions[0].sha1, root->versions[1].sha1); From 243f801d1d08753cd4eff2a23e245f7575c37ad5 Mon Sep 17 00:00:00 2001 From: "Shawn O. Pearce" Date: Mon, 28 Aug 2006 13:15:48 -0400 Subject: [PATCH 32/81] Reuse the same buffer for all commits/tags in fast-import. Since most commits and tag objects are around the same size and we only generate one at a time we can reuse the same buffer rather than xmalloc'ing and free'ing the buffer every time we generate a commit. Signed-off-by: Shawn O. 
Pearce --- fast-import.c | 43 +++++++++++++++++++++++-------------------- 1 file changed, 23 insertions(+), 20 deletions(-) diff --git a/fast-import.c b/fast-import.c index 8d15a05739..3d99102005 100644 --- a/fast-import.c +++ b/fast-import.c @@ -267,6 +267,7 @@ static struct tag *last_tag; /* Input stream parsing */ static struct strbuf command_buf; static unsigned long next_mark; +static struct dbuf new_data; static FILE* branch_log; @@ -381,6 +382,17 @@ static char* pool_strdup(const char *s) return r; } +static void size_dbuf(struct dbuf *b, size_t maxlen) +{ + if (b->buffer) { + if (b->capacity >= maxlen) + return; + free(b->buffer); + } + b->capacity = ((maxlen / 1024) + 1) * 1024; + b->buffer = xmalloc(b->capacity); +} + static void insert_mark(unsigned long idnum, struct object_entry *oe) { struct mark_set *s = marks; @@ -925,15 +937,7 @@ static void mktree(struct tree_content *t, maxlen += t->entries[i]->name->str_len + 34; } - if (b->buffer) { - if (b->capacity < maxlen) - b->capacity = ((maxlen / 1024) + 1) * 1024; - b->buffer = xrealloc(b->buffer, b->capacity); - } else { - b->capacity = ((maxlen / 1024) + 1) * 1024; - b->buffer = xmalloc(b->capacity); - } - + size_dbuf(b, maxlen); c = b->buffer; for (i = 0; i < t->entry_count; i++) { struct tree_entry *e = t->entries[i]; @@ -1515,7 +1519,6 @@ static void cmd_new_commit() char *sp; char *author = NULL; char *committer = NULL; - char *body; /* Obtain the branch name from the rest of our command */ sp = strchr(command_buf.buf, ' ') + 1; @@ -1568,11 +1571,11 @@ static void cmd_new_commit() /* build the tree and the commit */ store_tree(&b->branch_tree); - body = xmalloc(97 + msglen + size_dbuf(&new_data, 97 + msglen + (author ? 
strlen(author) + strlen(committer) : 2 * strlen(committer))); - sp = body; + sp = new_data.buffer; sp += sprintf(sp, "tree %s\n", sha1_to_hex(b->branch_tree.versions[1].sha1)); if (!is_null_sha1(b->sha1)) @@ -1589,8 +1592,9 @@ static void cmd_new_commit() free(committer); free(msg); - store_object(OBJ_COMMIT, body, sp - body, NULL, b->sha1, next_mark); - free(body); + store_object(OBJ_COMMIT, + new_data.buffer, sp - (char*)new_data.buffer, + NULL, b->sha1, next_mark); b->last_commit = object_count_by_type[OBJ_COMMIT]; if (branch_log) { @@ -1616,7 +1620,6 @@ static void cmd_new_tag() struct branch *s; void *msg; size_t msglen; - char *body; struct tag *t; unsigned long from_mark = 0; unsigned char sha1[20]; @@ -1688,8 +1691,8 @@ static void cmd_new_tag() msg = cmd_data(&msglen); /* build the tag object */ - body = xmalloc(67 + strlen(t->name) + strlen(tagger) + msglen); - sp = body; + size_dbuf(&new_data, 67+strlen(t->name)+strlen(tagger)+msglen); + sp = new_data.buffer; sp += sprintf(sp, "object %s\n", sha1_to_hex(sha1)); sp += sprintf(sp, "type %s\n", type_names[OBJ_COMMIT]); sp += sprintf(sp, "tag %s\n", t->name); @@ -1699,8 +1702,8 @@ static void cmd_new_tag() free(tagger); free(msg); - store_object(OBJ_TAG, body, sp - body, NULL, t->sha1, 0); - free(body); + store_object(OBJ_TAG, new_data.buffer, sp - (char*)new_data.buffer, + NULL, t->sha1, 0); if (branch_log) { int need_dq = quote_c_style(t->name, NULL, NULL, 0); @@ -1749,7 +1752,7 @@ int main(int argc, const char **argv) { const char *base_name; int i; - unsigned long est_obj_cnt = 1000; + unsigned long est_obj_cnt = object_entry_alloc; char *pack_name; char *idx_name; struct stat sb; From 23bc886c966b4362555b61f33c6eef71552e4d0e Mon Sep 17 00:00:00 2001 From: "Shawn O. Pearce" Date: Mon, 28 Aug 2006 13:54:01 -0400 Subject: [PATCH 33/81] Replace ywrite in fast-import with the standard write_or_die. Signed-off-by: Shawn O. 
Pearce --- fast-import.c | 27 +++++++-------------------- 1 file changed, 7 insertions(+), 20 deletions(-) diff --git a/fast-import.c b/fast-import.c index 3d99102005..f94f307ee6 100644 --- a/fast-import.c +++ b/fast-import.c @@ -586,19 +586,6 @@ static void yread(int fd, void *buffer, size_t length) } } -static void ywrite(int fd, void *buffer, size_t length) -{ - ssize_t ret = 0; - while (ret < length) { - ssize_t size = xwrite(fd, (char *) buffer + ret, length - ret); - if (!size) - die("Write to descriptor %i: end of file", fd); - if (size < 0) - die("Write to descriptor %i: %s", fd, strerror(errno)); - ret += size; - } -} - static size_t encode_header( enum object_type type, size_t size, @@ -675,8 +662,8 @@ static int store_object( s.next_in = delta; s.avail_in = deltalen; hdrlen = encode_header(OBJ_DELTA, deltalen, hdr); - ywrite(pack_fd, hdr, hdrlen); - ywrite(pack_fd, last->sha1, sizeof(sha1)); + write_or_die(pack_fd, hdr, hdrlen); + write_or_die(pack_fd, last->sha1, sizeof(sha1)); pack_size += hdrlen + sizeof(sha1); } else { if (last) @@ -684,7 +671,7 @@ static int store_object( s.next_in = dat; s.avail_in = datlen; hdrlen = encode_header(type, datlen, hdr); - ywrite(pack_fd, hdr, hdrlen); + write_or_die(pack_fd, hdr, hdrlen); pack_size += hdrlen; } @@ -694,7 +681,7 @@ static int store_object( /* nothing */; deflateEnd(&s); - ywrite(pack_fd, out, s.total_out); + write_or_die(pack_fd, out, s.total_out); pack_size += s.total_out; free(out); @@ -1117,7 +1104,7 @@ static void init_pack_header() hdr.hdr_version = htonl(2); hdr.hdr_entries = 0; - ywrite(pack_fd, &hdr, sizeof(hdr)); + write_or_die(pack_fd, &hdr, sizeof(hdr)); pack_size = sizeof(hdr); } @@ -1138,7 +1125,7 @@ static void fixup_header_footer() cnt = htonl(object_count); SHA1_Update(&c, &cnt, 4); - ywrite(pack_fd, &cnt, 4); + write_or_die(pack_fd, &cnt, 4); buf = xmalloc(128 * 1024); for (;;) { @@ -1150,7 +1137,7 @@ static void fixup_header_footer() free(buf); SHA1_Final(pack_sha1, &c); - 
ywrite(pack_fd, pack_sha1, sizeof(pack_sha1)); + write_or_die(pack_fd, pack_sha1, sizeof(pack_sha1)); } static int oecmp (const void *_a, const void *_b) From b54d6422b1a277ee905819e01020f5690196a999 Mon Sep 17 00:00:00 2001 From: "Shawn O. Pearce" Date: Mon, 28 Aug 2006 21:43:04 -0400 Subject: [PATCH 34/81] Correct tree corruption problems in fast-import. The new tree delta implementation caused blob SHA1s to be used instead of a tree SHA1 when a tree was written out. This really only appeared to happen when converting an existing file to a tree, but may have been possible in some other situations. Signed-off-by: Shawn O. Pearce --- fast-import.c | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/fast-import.c b/fast-import.c index f94f307ee6..34ff946fa3 100644 --- a/fast-import.c +++ b/fast-import.c @@ -956,7 +956,8 @@ static void store_tree(struct tree_entry *root) } if (is_null_sha1(root->versions[0].sha1) - || !find_object(root->versions[0].sha1)) { + || !find_object(root->versions[0].sha1) + || !S_ISDIR(root->versions[0].mode)) { lo.data = NULL; lo.depth = 0; } else { @@ -1023,6 +1024,7 @@ static int tree_content_set( if (!S_ISDIR(e->versions[1].mode)) { e->tree = new_tree_content(8); e->versions[1].mode = S_IFDIR; + hashclr(e->versions[1].sha1); } if (!e->tree) load_tree(e); @@ -1044,6 +1046,7 @@ static int tree_content_set( if (slash1) { e->tree = new_tree_content(8); e->versions[1].mode = S_IFDIR; + hashclr(e->versions[1].sha1); tree_content_set(e, slash1 + 1, sha1, mode); } else { e->tree = NULL; @@ -1075,10 +1078,13 @@ static int tree_content_remove(struct tree_entry *root, const char *p) if (!e->tree) load_tree(e); if (tree_content_remove(e, slash1 + 1)) { - if (!e->tree->entry_count) - goto del_entry; - hashclr(root->versions[1].sha1); - return 1; + for (n = 0; n < e->tree->entry_count; n++) { + if (e->tree->entries[n]->versions[1].mode) { + hashclr(root->versions[1].sha1); + return 1; + } + } + goto del_entry; } return 
0; } From 8a8c55ea709d26ca397d6588e85579339885f507 Mon Sep 17 00:00:00 2001 From: "Shawn O. Pearce" Date: Mon, 28 Aug 2006 22:06:13 -0400 Subject: [PATCH 35/81] Additional fast-import tree delta corruption cleanups. Signed-off-by: Shawn O. Pearce --- fast-import.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/fast-import.c b/fast-import.c index 34ff946fa3..e35a89f6cd 100644 --- a/fast-import.c +++ b/fast-import.c @@ -478,6 +478,8 @@ static struct branch* new_branch(const char *name) b = pool_calloc(1, sizeof(struct branch)); b->name = pool_strdup(name); b->table_next_branch = branch_table[hc]; + b->branch_tree.versions[0].mode = S_IFDIR; + b->branch_tree.versions[1].mode = S_IFDIR; branch_table[hc] = b; branch_count++; return b; @@ -955,9 +957,9 @@ static void store_tree(struct tree_entry *root) store_tree(t->entries[i]); } - if (is_null_sha1(root->versions[0].sha1) - || !find_object(root->versions[0].sha1) - || !S_ISDIR(root->versions[0].mode)) { + if (!S_ISDIR(root->versions[0].mode) + || is_null_sha1(root->versions[0].sha1) + || !find_object(root->versions[0].sha1)) { lo.data = NULL; lo.depth = 0; } else { @@ -967,13 +969,12 @@ static void store_tree(struct tree_entry *root) lo.no_free = 1; hashcpy(lo.sha1, root->versions[0].sha1); } - mktree(t, 1, &new_len, &new_tree); + mktree(t, 1, &new_len, &new_tree); store_object(OBJ_TREE, new_tree.buffer, new_len, &lo, root->versions[1].sha1, 0); t->delta_depth = lo.depth; - hashcpy(root->versions[0].sha1, root->versions[1].sha1); for (i = 0, j = 0, del = 0; i < t->entry_count; i++) { struct tree_entry *e = t->entries[i]; if (e->versions[1].mode) { @@ -1024,7 +1025,6 @@ static int tree_content_set( if (!S_ISDIR(e->versions[1].mode)) { e->tree = new_tree_content(8); e->versions[1].mode = S_IFDIR; - hashclr(e->versions[1].sha1); } if (!e->tree) load_tree(e); @@ -1046,7 +1046,6 @@ static int tree_content_set( if (slash1) { e->tree = new_tree_content(8); e->versions[1].mode = 
S_IFDIR; - hashclr(e->versions[1].sha1); tree_content_set(e, slash1 + 1, sha1, mode); } else { e->tree = NULL; @@ -1564,6 +1563,8 @@ static void cmd_new_commit() /* build the tree and the commit */ store_tree(&b->branch_tree); + hashcpy(b->branch_tree.versions[0].sha1, + b->branch_tree.versions[1].sha1); size_dbuf(&new_data, 97 + msglen + (author ? strlen(author) + strlen(committer) @@ -1823,9 +1824,9 @@ int main(int argc, const char **argv) fclose(branch_log); fprintf(stderr, "%s statistics:\n", argv[0]); - fprintf(stderr, "---------------------------------------------------\n"); + fprintf(stderr, "---------------------------------------------------------------------\n"); fprintf(stderr, "Alloc'd objects: %10lu (%10lu overflow )\n", alloc_count, alloc_count - est_obj_cnt); - fprintf(stderr, "Total objects: %10lu (%10lu duplicates)\n", object_count, duplicate_count); + fprintf(stderr, "Total objects: %10lu (%10lu duplicates )\n", object_count, duplicate_count); fprintf(stderr, " blobs : %10lu (%10lu duplicates %10lu deltas)\n", object_count_by_type[OBJ_BLOB], duplicate_count_by_type[OBJ_BLOB], delta_count_by_type[OBJ_BLOB]); fprintf(stderr, " trees : %10lu (%10lu duplicates %10lu deltas)\n", object_count_by_type[OBJ_TREE], duplicate_count_by_type[OBJ_TREE], delta_count_by_type[OBJ_TREE]); fprintf(stderr, " commits: %10lu (%10lu duplicates %10lu deltas)\n", object_count_by_type[OBJ_COMMIT], duplicate_count_by_type[OBJ_COMMIT], delta_count_by_type[OBJ_COMMIT]); @@ -1837,12 +1838,11 @@ int main(int argc, const char **argv) fprintf(stderr, " pools: %10lu KiB\n", total_allocd/1024); fprintf(stderr, " objects: %10lu KiB\n", (alloc_count*sizeof(struct object_entry))/1024); fprintf(stderr, "Pack remaps: %10lu\n", remap_count); - fprintf(stderr, "---------------------------------------------------\n"); - stat(pack_name, &sb); fprintf(stderr, "Pack size: %10lu KiB\n", (unsigned long)(sb.st_size/1024)); stat(idx_name, &sb); fprintf(stderr, "Index size: %10lu KiB\n", (unsigned 
long)(sb.st_size/1024)); + fprintf(stderr, "---------------------------------------------------------------------\n"); fprintf(stderr, "\n"); From cacbdd0afb481a6f3019e5e7db98f88e40941fd5 Mon Sep 17 00:00:00 2001 From: "Shawn O. Pearce" Date: Thu, 11 Jan 2007 21:25:01 -0500 Subject: [PATCH 36/81] Fix repository corruption when using marks for modified blobs. Apparently we did not copy the blob SHA1 into the stack variable 'sha1' when a mark is used to refer to a prior blob. This code was not previously tested as the Mozilla CVS -> git-fast-import program always fed us full SHA1s for modified blobs and did not use the mark feature there. Signed-off-by: Shawn O. Pearce --- fast-import.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fast-import.c b/fast-import.c index e35a89f6cd..e9a46c6c3a 100644 --- a/fast-import.c +++ b/fast-import.c @@ -1378,6 +1378,7 @@ static void file_change_m(struct branch *b) if (*p == ':') { char *x; oe = find_mark(strtoul(p + 1, &x, 10)); + hashcpy(sha1, oe->sha1); p = x; } else { if (get_sha1_hex(p, sha1)) From 62b6f48388faf0ac2432a07cfc53aa904c591f8f Mon Sep 17 00:00:00 2001 From: "Shawn O. Pearce" Date: Thu, 11 Jan 2007 22:21:38 -0500 Subject: [PATCH 37/81] Support creation of merge commits in fast-import. Some importers are able to determine when branch merges occurred within their source data. In these cases they will want to supply the correct commits to fast-import so that a proper merge commit will exist in Git. This is now supported by supplying a 'merge ' command after the commit message and optional from command. A merge is not actually performed by fast-import; it's assumed that the frontend performed any sort of merging activity already and that fast-import should simply be storing its result. Signed-off-by: Shawn O.
Pearce --- fast-import.c | 58 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 58 insertions(+) diff --git a/fast-import.c b/fast-import.c index e9a46c6c3a..15db4b39d1 100644 --- a/fast-import.c +++ b/fast-import.c @@ -20,6 +20,7 @@ Format of STDIN stream: 'committer' sp name '<' email '>' ts tz lf commit_msg ('from' sp (ref_str | hexsha1 | sha1exp_str | idnum) lf)? + ('merge' sp (ref_str | hexsha1 | sha1exp_str | idnum) lf)* file_change* lf; commit_msg ::= data; @@ -202,6 +203,11 @@ struct dbuf size_t capacity; }; +struct hash_list +{ + struct hash_list *next; + unsigned char sha1[20]; +}; /* Stats and misc. counters */ static unsigned long max_depth = 10; @@ -1502,6 +1508,48 @@ static void cmd_from(struct branch *b) read_next_command(); } +static struct hash_list* cmd_merge(unsigned int *count) +{ + struct hash_list *list = NULL, *n, *e; + const char *from, *endp; + char *str_uq; + struct branch *s; + + *count = 0; + while (!strncmp("merge ", command_buf.buf, 6)) { + from = strchr(command_buf.buf, ' ') + 1; + str_uq = unquote_c_style(from, &endp); + if (str_uq) { + if (*endp) + die("Garbage after string in: %s", command_buf.buf); + from = str_uq; + } + + n = xmalloc(sizeof(*n)); + s = lookup_branch(from); + if (s) + hashcpy(n->sha1, s->sha1); + else if (*from == ':') { + unsigned long idnum = strtoul(from + 1, NULL, 10); + struct object_entry *oe = find_mark(idnum); + if (oe->type != OBJ_COMMIT) + die("Mark :%lu not a commit", idnum); + hashcpy(n->sha1, oe->sha1); + } else if (get_sha1(from, n->sha1)) + die("Invalid ref name or SHA1 expression: %s", from); + + n->next = NULL; + if (list) + e->next = n; + else + list = n; + e = n; + *count++; + read_next_command(); + } + return list; +} + static void cmd_new_commit() { struct branch *b; @@ -1512,6 +1560,8 @@ static void cmd_new_commit() char *sp; char *author = NULL; char *committer = NULL; + struct hash_list *merge_list = NULL; + unsigned int merge_count; /* Obtain the branch name from the rest 
of our command */ sp = strchr(command_buf.buf, ' ') + 1; @@ -1542,6 +1592,7 @@ static void cmd_new_commit() msg = cmd_data(&msglen); read_next_command(); cmd_from(b); + merge_list = cmd_merge(&merge_count); /* ensure the branch is active/loaded */ if (!b->branch_tree.tree || !max_active_branches) { @@ -1567,6 +1618,7 @@ static void cmd_new_commit() hashcpy(b->branch_tree.versions[0].sha1, b->branch_tree.versions[1].sha1); size_dbuf(&new_data, 97 + msglen + + merge_count * 49 + (author ? strlen(author) + strlen(committer) : 2 * strlen(committer))); @@ -1575,6 +1627,12 @@ static void cmd_new_commit() sha1_to_hex(b->branch_tree.versions[1].sha1)); if (!is_null_sha1(b->sha1)) sp += sprintf(sp, "parent %s\n", sha1_to_hex(b->sha1)); + while (merge_list) { + struct hash_list *next = merge_list->next; + sp += sprintf(sp, "parent %s\n", sha1_to_hex(merge_list->sha1)); + free(merge_list); + merge_list = next; + } if (author) sp += sprintf(sp, "%s\n", author); else From 9938ffc53a15c755bbd3894c02492b940ea34c4c Mon Sep 17 00:00:00 2001 From: "Shawn O. Pearce" Date: Thu, 11 Jan 2007 22:28:39 -0500 Subject: [PATCH 38/81] Allow creating branches without committing in fast-import. Some importers may want to create a branch long before they actually commit to it, or in some cases they may never commit to the branch but they still need the ref to be created in the repository after the import is complete. This extends the 'reset ' command to automatically create a new branch if the supplied reference isn't already known as a branch. While I'm at it I also modified the syntax of the reset command to terminate with an empty line, like commit and tag operate. This just makes the command set more consistent. Signed-off-by: Shawn O. 
Pearce --- fast-import.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/fast-import.c b/fast-import.c index 15db4b39d1..38e24bf6a6 100644 --- a/fast-import.c +++ b/fast-import.c @@ -36,7 +36,9 @@ Format of STDIN stream: tag_msg; tag_msg ::= data; - reset_branch ::= 'reset' sp ref_str lf; + reset_branch ::= 'reset' sp ref_str lf + ('from' sp (ref_str | hexsha1 | sha1exp_str | idnum) lf)? + lf; # note: the first idnum in a stream should be 1 and subsequent # idnums should not have gaps between values as this will cause @@ -1794,8 +1796,12 @@ static void cmd_reset_branch() b->branch_tree.tree = NULL; } } + else + b = new_branch(sp); if (str_uq) free(str_uq); + read_next_command(); + cmd_from(b); } static const char fast_import_usage[] = From d489bc14919cdd37d3978065591199d21d6719f8 Mon Sep 17 00:00:00 2001 From: "Shawn O. Pearce" Date: Sun, 14 Jan 2007 06:20:23 -0500 Subject: [PATCH 39/81] Improve reuse of sha1_file library within fast-import. Now that the sha1_file.c library routines use the sliding mmap routines to perform efficient access to portions of a packfile I can remove that code from fast-import.c and just invoke it. One benefit is we now have reloading support for any packfile which uses OBJ_OFS_DELTA. Another is we have significantly less code to maintain. This code reuse change *requires* that fast-import generate only an OBJ_OFS_DELTA format packfile, as there is absolutely no index available to perform OBJ_REF_DELTA lookup in while unpacking an object. This is probably reasonable to require as the delta offsets result in smaller packfiles and are faster to unpack, as no index searching is required. It's also only a temporary requirement as users could always repack without offsets before making the import available to older versions of Git. Signed-off-by: Shawn O.
Pearce --- fast-import.c | 175 +++++++++----------------------------------------- 1 file changed, 31 insertions(+), 144 deletions(-) diff --git a/fast-import.c b/fast-import.c index 492a8594bf..f0f51a6899 100644 --- a/fast-import.c +++ b/fast-import.c @@ -136,9 +136,9 @@ struct last_object { void *data; unsigned long len; + unsigned long offset; unsigned int depth; - int no_free; - unsigned char sha1[20]; + unsigned no_free:1; }; struct mem_pool @@ -235,13 +235,10 @@ static unsigned int atom_cnt; static struct atom_str **atom_table; /* The .pack file being generated */ +static struct packed_git *pack_data; static int pack_fd; static unsigned long pack_size; static unsigned char pack_sha1[20]; -static unsigned char* pack_base; -static unsigned long pack_moff; -static unsigned long pack_mlen = 128*1024*1024; -static unsigned long page_size; /* Table of objects we've written. */ static unsigned int object_entry_alloc = 5000; @@ -667,14 +664,23 @@ static int store_object( deflateInit(&s, zlib_compression_level); if (delta) { + unsigned long ofs = e->offset - last->offset; + unsigned pos = sizeof(hdr) - 1; + delta_count_by_type[type]++; last->depth++; s.next_in = delta; s.avail_in = deltalen; - hdrlen = encode_header(OBJ_REF_DELTA, deltalen, hdr); + + hdrlen = encode_header(OBJ_OFS_DELTA, deltalen, hdr); write_or_die(pack_fd, hdr, hdrlen); - write_or_die(pack_fd, last->sha1, sizeof(sha1)); - pack_size += hdrlen + sizeof(sha1); + pack_size += hdrlen; + + hdr[pos] = ofs & 127; + while (ofs >>= 7) + hdr[--pos] = 128 | (--ofs & 127); + write_or_die(pack_fd, hdr + pos, sizeof(hdr) - pos); + pack_size += sizeof(hdr) - pos; } else { if (last) last->depth = 0; @@ -701,139 +707,17 @@ static int store_object( if (last->data && !last->no_free) free(last->data); last->data = dat; + last->offset = e->offset; last->len = datlen; - hashcpy(last->sha1, sha1); } return 0; } -static unsigned char* map_pack(unsigned long offset, unsigned int *left) +static void *gfi_unpack_entry(unsigned 
long ofs, unsigned long *sizep) { - if (offset >= pack_size) - die("object offset outside of pack file"); - if (!pack_base - || offset < pack_moff - || (offset + 20) >= (pack_moff + pack_mlen)) { - if (pack_base) - munmap(pack_base, pack_mlen); - pack_moff = (offset / page_size) * page_size; - pack_base = mmap(NULL,pack_mlen,PROT_READ,MAP_SHARED, - pack_fd,pack_moff); - if (pack_base == MAP_FAILED) - die("Failed to map generated pack: %s", strerror(errno)); - remap_count++; - } - offset -= pack_moff; - if (left) - *left = pack_mlen - offset; - return pack_base + offset; -} - -static unsigned long unpack_object_header(unsigned long offset, - enum object_type *type, - unsigned long *sizep) -{ - unsigned shift; - unsigned char c; - unsigned long size; - - c = *map_pack(offset++, NULL); - *type = (c >> 4) & 7; - size = c & 15; - shift = 4; - while (c & 0x80) { - c = *map_pack(offset++, NULL); - size += (c & 0x7f) << shift; - shift += 7; - } - *sizep = size; - return offset; -} - -static void *unpack_non_delta_entry(unsigned long o, unsigned long sz) -{ - z_stream stream; - unsigned char *result; - - result = xmalloc(sz + 1); - result[sz] = 0; - - memset(&stream, 0, sizeof(stream)); - stream.next_in = map_pack(o, &stream.avail_in); - stream.next_out = result; - stream.avail_out = sz; - - inflateInit(&stream); - for (;;) { - int st = inflate(&stream, Z_FINISH); - if (st == Z_STREAM_END) - break; - if (st == Z_OK || st == Z_BUF_ERROR) { - o = stream.next_in - pack_base + pack_moff; - stream.next_in = map_pack(o, &stream.avail_in); - continue; - } - die("Error %i from zlib during inflate.", st); - } - inflateEnd(&stream); - if (stream.total_out != sz) - die("Error after inflate: sizes mismatch"); - return result; -} - -static void *gfi_unpack_entry(unsigned long offset, - unsigned long *sizep, - unsigned int *delta_depth); - -static void *unpack_delta_entry(unsigned long offset, - unsigned long delta_size, - unsigned long *sizep, - unsigned int *delta_depth) -{ - struct 
object_entry *base_oe; - unsigned char *base_sha1; - void *delta_data, *base, *result; - unsigned long base_size, result_size; - - base_sha1 = map_pack(offset, NULL); - base_oe = find_object(base_sha1); - if (!base_oe) - die("I'm broken; I can't find a base I know must be here."); - base = gfi_unpack_entry(base_oe->offset, &base_size, delta_depth); - delta_data = unpack_non_delta_entry(offset + 20, delta_size); - result = patch_delta(base, base_size, - delta_data, delta_size, - &result_size); - if (!result) - die("failed to apply delta"); - free(delta_data); - free(base); - *sizep = result_size; - (*delta_depth)++; - return result; -} - -static void *gfi_unpack_entry(unsigned long offset, - unsigned long *sizep, - unsigned int *delta_depth) -{ - unsigned long size; - enum object_type kind; - - offset = unpack_object_header(offset, &kind, &size); - switch (kind) { - case OBJ_REF_DELTA: - return unpack_delta_entry(offset, size, sizep, delta_depth); - case OBJ_COMMIT: - case OBJ_TREE: - case OBJ_BLOB: - case OBJ_TAG: - *sizep = size; - *delta_depth = 0; - return unpack_non_delta_entry(offset, size); - default: - die("I created an object I can't read!"); - } + char type[20]; + pack_data->pack_size = pack_size + 20; + return unpack_entry(pack_data, ofs, type, sizep); } static const char *get_mode(const char *str, unsigned int *modep) @@ -867,7 +751,8 @@ static void load_tree(struct tree_entry *root) if (myoe) { if (myoe->type != OBJ_TREE) die("Not a tree: %s", sha1_to_hex(sha1)); - buf = gfi_unpack_entry(myoe->offset, &size, &t->delta_depth); + t->delta_depth = 0; + buf = gfi_unpack_entry(myoe->offset, &size); } else { char type[20]; buf = read_sha1_file(sha1, type, &size); @@ -956,6 +841,7 @@ static void store_tree(struct tree_entry *root) unsigned int i, j, del; unsigned long new_len; struct last_object lo; + struct object_entry *le; if (!is_null_sha1(root->versions[1].sha1)) return; @@ -965,17 +851,16 @@ static void store_tree(struct tree_entry *root) 
store_tree(t->entries[i]); } - if (!S_ISDIR(root->versions[0].mode) - || is_null_sha1(root->versions[0].sha1) - || !find_object(root->versions[0].sha1)) { + le = find_object(root->versions[0].sha1); + if (!S_ISDIR(root->versions[0].mode) || !le) { lo.data = NULL; lo.depth = 0; } else { mktree(t, 0, &lo.len, &old_tree); lo.data = old_tree.buffer; + lo.offset = le->offset; lo.depth = t->delta_depth; lo.no_free = 1; - hashcpy(lo.sha1, root->versions[0].sha1); } mktree(t, 1, &new_len, &new_tree); @@ -1471,12 +1356,11 @@ static void cmd_from(struct branch *b) unsigned long idnum = strtoul(from + 1, NULL, 10); struct object_entry *oe = find_mark(idnum); unsigned long size; - unsigned int depth; char *buf; if (oe->type != OBJ_COMMIT) die("Mark :%lu not a commit", idnum); hashcpy(b->sha1, oe->sha1); - buf = gfi_unpack_entry(oe->offset, &size, &depth); + buf = gfi_unpack_entry(oe->offset, &size); if (!buf || size < 46) die("Not a valid commit: %s", from); if (memcmp("tree ", buf, 5) @@ -1818,7 +1702,6 @@ int main(int argc, const char **argv) setup_ident(); git_config(git_default_config); - page_size = getpagesize(); for (i = 1; i < argc; i++) { const char *a = argv[i]; @@ -1854,6 +1737,10 @@ int main(int argc, const char **argv) if (pack_fd < 0) die("Can't create %s: %s", pack_name, strerror(errno)); + pack_data = xcalloc(1, sizeof(*pack_data) + strlen(pack_name) + 2); + strcpy(pack_data->pack_name, pack_name); + pack_data->pack_fd = pack_fd; + init_pack_header(); alloc_objects(est_obj_cnt); strbuf_init(&command_buf); From 03842d8e24face522fa0ca846283da33e747e4f0 Mon Sep 17 00:00:00 2001 From: "Shawn O. Pearce" Date: Mon, 15 Jan 2007 00:16:23 -0500 Subject: [PATCH 40/81] Misc. type cleanups within fast-import. Signed-off-by: Shawn O. 
Pearce --- fast-import.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/fast-import.c b/fast-import.c index f0f51a6899..3a98cb848f 100644 --- a/fast-import.c +++ b/fast-import.c @@ -110,8 +110,8 @@ Format of STDIN stream: struct object_entry { struct object_entry *next; - enum object_type type; unsigned long offset; + unsigned type : TYPE_BITS; unsigned char sha1[20]; }; @@ -220,9 +220,9 @@ static unsigned long remap_count; static unsigned long object_count; static unsigned long duplicate_count; static unsigned long marks_set_count; -static unsigned long object_count_by_type[9]; -static unsigned long duplicate_count_by_type[9]; -static unsigned long delta_count_by_type[9]; +static unsigned long object_count_by_type[1 << TYPE_BITS]; +static unsigned long duplicate_count_by_type[1 << TYPE_BITS]; +static unsigned long delta_count_by_type[1 << TYPE_BITS]; /* Memory pools */ static size_t mem_pool_alloc = 2*1024*1024 - sizeof(struct mem_pool); @@ -276,7 +276,7 @@ static struct dbuf new_data; static FILE* branch_log; -static void alloc_objects(int cnt) +static void alloc_objects(unsigned int cnt) { struct object_entry_pool *b; From f70b653429ebc7fdde0b36a63e1deb4aadb450d3 Mon Sep 17 00:00:00 2001 From: "Shawn O. Pearce" Date: Mon, 15 Jan 2007 04:39:05 -0500 Subject: [PATCH 41/81] Restructure fast-import to support creating multiple packfiles. Now that we are starting to see some really large projects (such as KDE or a fork of FreeBSD) get imported into Git we're running into the upper limit on packfile object count as well as overall byte length. The KDE and FreeBSD projects are both likely to require more than 4 GiB to store their current history, which means we really need multiple packfiles to handle their content. This is a fairly simple restructuring of the internal code to help us support creating multiple packfiles from within fast-import. 
We are now adding a 5 digit incrementing suffix to the end of the basename supplied to us by the caller, permitting up to 99,999 packs to be generated in a single fast-import run. Signed-off-by: Shawn O. Pearce --- fast-import.c | 245 +++++++++++++++++++++++++------------------------- 1 file changed, 124 insertions(+), 121 deletions(-) diff --git a/fast-import.c b/fast-import.c index 3a98cb848f..fc8567e9f6 100644 --- a/fast-import.c +++ b/fast-import.c @@ -216,7 +216,6 @@ static unsigned long max_depth = 10; static unsigned long alloc_count; static unsigned long branch_count; static unsigned long branch_load_count; -static unsigned long remap_count; static unsigned long object_count; static unsigned long duplicate_count; static unsigned long marks_set_count; @@ -235,6 +234,10 @@ static unsigned int atom_cnt; static struct atom_str **atom_table; /* The .pack file being generated */ +static const char *base_name; +static unsigned int pack_count; +static char *pack_name; +static char *idx_name; static struct packed_git *pack_data; static int pack_fd; static unsigned long pack_size; @@ -593,6 +596,124 @@ static void yread(int fd, void *buffer, size_t length) } } +static void start_packfile() +{ + struct pack_header hdr; + + pack_count++; + pack_name = xmalloc(strlen(base_name) + 11); + idx_name = xmalloc(strlen(base_name) + 11); + sprintf(pack_name, "%s%5.5i.pack", base_name, pack_count); + sprintf(idx_name, "%s%5.5i.idx", base_name, pack_count); + + pack_fd = open(pack_name, O_RDWR|O_CREAT|O_EXCL, 0666); + if (pack_fd < 0) + die("Can't create %s: %s", pack_name, strerror(errno)); + + pack_data = xcalloc(1, sizeof(*pack_data) + strlen(pack_name) + 2); + strcpy(pack_data->pack_name, pack_name); + pack_data->pack_fd = pack_fd; + + hdr.hdr_signature = htonl(PACK_SIGNATURE); + hdr.hdr_version = htonl(2); + hdr.hdr_entries = 0; + + write_or_die(pack_fd, &hdr, sizeof(hdr)); + pack_size = sizeof(hdr); + object_count = 0; +} + +static void fixup_header_footer() +{ + SHA_CTX c; 
+ char hdr[8]; + unsigned long cnt; + char *buf; + + if (lseek(pack_fd, 0, SEEK_SET) != 0) + die("Failed seeking to start: %s", strerror(errno)); + + SHA1_Init(&c); + yread(pack_fd, hdr, 8); + SHA1_Update(&c, hdr, 8); + + cnt = htonl(object_count); + SHA1_Update(&c, &cnt, 4); + write_or_die(pack_fd, &cnt, 4); + + buf = xmalloc(128 * 1024); + for (;;) { + size_t n = xread(pack_fd, buf, 128 * 1024); + if (n <= 0) + break; + SHA1_Update(&c, buf, n); + } + free(buf); + + SHA1_Final(pack_sha1, &c); + write_or_die(pack_fd, pack_sha1, sizeof(pack_sha1)); +} + +static int oecmp (const void *a_, const void *b_) +{ + struct object_entry *a = *((struct object_entry**)a_); + struct object_entry *b = *((struct object_entry**)b_); + return hashcmp(a->sha1, b->sha1); +} + +static void write_index(const char *idx_name) +{ + struct sha1file *f; + struct object_entry **idx, **c, **last, *e; + struct object_entry_pool *o; + unsigned int array[256]; + int i; + + /* Build the sorted table of object IDs. */ + idx = xmalloc(object_count * sizeof(struct object_entry*)); + c = idx; + for (o = blocks; o; o = o->next_pool) + for (e = o->entries; e != o->next_free; e++) + *c++ = e; + last = idx + object_count; + qsort(idx, object_count, sizeof(struct object_entry*), oecmp); + + /* Generate the fan-out array. 
*/ + c = idx; + for (i = 0; i < 256; i++) { + struct object_entry **next = c;; + while (next < last) { + if ((*next)->sha1[0] != i) + break; + next++; + } + array[i] = htonl(next - idx); + c = next; + } + + f = sha1create("%s", idx_name); + sha1write(f, array, 256 * sizeof(int)); + for (c = idx; c != last; c++) { + unsigned int offset = htonl((*c)->offset); + sha1write(f, &offset, 4); + sha1write(f, (*c)->sha1, sizeof((*c)->sha1)); + } + sha1write(f, pack_sha1, sizeof(pack_sha1)); + sha1close(f, NULL, 1); + free(idx); +} + +static void end_packfile() +{ + fixup_header_footer(); + close(pack_fd); + write_index(idx_name); + + free(pack_name); + free(idx_name); + free(pack_data); +} + static size_t encode_header( enum object_type type, size_t size, @@ -994,100 +1115,6 @@ del_entry: return 1; } -static void init_pack_header() -{ - struct pack_header hdr; - - hdr.hdr_signature = htonl(PACK_SIGNATURE); - hdr.hdr_version = htonl(2); - hdr.hdr_entries = 0; - - write_or_die(pack_fd, &hdr, sizeof(hdr)); - pack_size = sizeof(hdr); -} - -static void fixup_header_footer() -{ - SHA_CTX c; - char hdr[8]; - unsigned long cnt; - char *buf; - size_t n; - - if (lseek(pack_fd, 0, SEEK_SET) != 0) - die("Failed seeking to start: %s", strerror(errno)); - - SHA1_Init(&c); - yread(pack_fd, hdr, 8); - SHA1_Update(&c, hdr, 8); - - cnt = htonl(object_count); - SHA1_Update(&c, &cnt, 4); - write_or_die(pack_fd, &cnt, 4); - - buf = xmalloc(128 * 1024); - for (;;) { - n = xread(pack_fd, buf, 128 * 1024); - if (n <= 0) - break; - SHA1_Update(&c, buf, n); - } - free(buf); - - SHA1_Final(pack_sha1, &c); - write_or_die(pack_fd, pack_sha1, sizeof(pack_sha1)); -} - -static int oecmp (const void *_a, const void *_b) -{ - struct object_entry *a = *((struct object_entry**)_a); - struct object_entry *b = *((struct object_entry**)_b); - return hashcmp(a->sha1, b->sha1); -} - -static void write_index(const char *idx_name) -{ - struct sha1file *f; - struct object_entry **idx, **c, **last; - struct 
object_entry *e; - struct object_entry_pool *o; - unsigned int array[256]; - int i; - - /* Build the sorted table of object IDs. */ - idx = xmalloc(object_count * sizeof(struct object_entry*)); - c = idx; - for (o = blocks; o; o = o->next_pool) - for (e = o->entries; e != o->next_free; e++) - *c++ = e; - last = idx + object_count; - qsort(idx, object_count, sizeof(struct object_entry*), oecmp); - - /* Generate the fan-out array. */ - c = idx; - for (i = 0; i < 256; i++) { - struct object_entry **next = c;; - while (next < last) { - if ((*next)->sha1[0] != i) - break; - next++; - } - array[i] = htonl(next - idx); - c = next; - } - - f = sha1create("%s", idx_name); - sha1write(f, array, 256 * sizeof(int)); - for (c = idx; c != last; c++) { - unsigned int offset = htonl((*c)->offset); - sha1write(f, &offset, 4); - sha1write(f, (*c)->sha1, sizeof((*c)->sha1)); - } - sha1write(f, pack_sha1, sizeof(pack_sha1)); - sha1close(f, NULL, 1); - free(idx); -} - static void dump_branches() { static const char *msg = "fast-import"; @@ -1693,11 +1720,8 @@ static const char fast_import_usage[] = int main(int argc, const char **argv) { - const char *base_name; int i; unsigned long est_obj_cnt = object_entry_alloc; - char *pack_name; - char *idx_name; struct stat sb; setup_ident(); @@ -1728,20 +1752,6 @@ int main(int argc, const char **argv) usage(fast_import_usage); base_name = argv[i]; - pack_name = xmalloc(strlen(base_name) + 6); - sprintf(pack_name, "%s.pack", base_name); - idx_name = xmalloc(strlen(base_name) + 5); - sprintf(idx_name, "%s.idx", base_name); - - pack_fd = open(pack_name, O_RDWR|O_CREAT|O_EXCL, 0666); - if (pack_fd < 0) - die("Can't create %s: %s", pack_name, strerror(errno)); - - pack_data = xcalloc(1, sizeof(*pack_data) + strlen(pack_name) + 2); - strcpy(pack_data->pack_name, pack_name); - pack_data->pack_fd = pack_fd; - - init_pack_header(); alloc_objects(est_obj_cnt); strbuf_init(&command_buf); @@ -1750,6 +1760,7 @@ int main(int argc, const char **argv) 
avail_tree_table = xcalloc(avail_tree_table_sz, sizeof(struct avail_tree_content*)); marks = pool_calloc(1, sizeof(struct mark_set)); + start_packfile(); for (;;) { read_next_command(); if (command_buf.eof) @@ -1765,10 +1776,8 @@ int main(int argc, const char **argv) else die("Unsupported command: %s", command_buf.buf); } + end_packfile(); - fixup_header_footer(); - close(pack_fd); - write_index(idx_name); dump_branches(); dump_tags(); dump_marks(); @@ -1789,13 +1798,7 @@ int main(int argc, const char **argv) fprintf(stderr, "Memory total: %10lu KiB\n", (total_allocd + alloc_count*sizeof(struct object_entry))/1024); fprintf(stderr, " pools: %10lu KiB\n", total_allocd/1024); fprintf(stderr, " objects: %10lu KiB\n", (alloc_count*sizeof(struct object_entry))/1024); - fprintf(stderr, "Pack remaps: %10lu\n", remap_count); - stat(pack_name, &sb); - fprintf(stderr, "Pack size: %10lu KiB\n", (unsigned long)(sb.st_size/1024)); - stat(idx_name, &sb); - fprintf(stderr, "Index size: %10lu KiB\n", (unsigned long)(sb.st_size/1024)); fprintf(stderr, "---------------------------------------------------------------------\n"); - fprintf(stderr, "\n"); return 0; From 80144727acc401070039434987692276dcb9273c Mon Sep 17 00:00:00 2001 From: "Shawn O. Pearce" Date: Mon, 15 Jan 2007 05:03:32 -0500 Subject: [PATCH 42/81] Remove unnecessary duplicate_count in fast-import. There is little reason to be keeping a global duplicate_count value when we also keep it per object type. The global counter can easily be computed at the end, once all processing has completed. This saves us a couple of machine instructions in an unimportant part of code. But it looks slightly better to me to not keep two counters around. Signed-off-by: Shawn O. 
Pearce --- fast-import.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/fast-import.c b/fast-import.c index fc8567e9f6..12127168bd 100644 --- a/fast-import.c +++ b/fast-import.c @@ -217,7 +217,6 @@ static unsigned long alloc_count; static unsigned long branch_count; static unsigned long branch_load_count; static unsigned long object_count; -static unsigned long duplicate_count; static unsigned long marks_set_count; static unsigned long object_count_by_type[1 << TYPE_BITS]; static unsigned long duplicate_count_by_type[1 << TYPE_BITS]; @@ -765,7 +764,6 @@ static int store_object( if (mark) insert_mark(mark, e); if (e->offset) { - duplicate_count++; duplicate_count_by_type[type]++; return 1; } @@ -1722,7 +1720,7 @@ int main(int argc, const char **argv) { int i; unsigned long est_obj_cnt = object_entry_alloc; - struct stat sb; + unsigned long duplicate_count; setup_ident(); git_config(git_default_config); @@ -1784,6 +1782,9 @@ int main(int argc, const char **argv) if (branch_log) fclose(branch_log); + for (i = 0; i < ARRAY_SIZE(duplicate_count_by_type); i++) + duplicate_count += duplicate_count_by_type[i]; + fprintf(stderr, "%s statistics:\n", argv[0]); fprintf(stderr, "---------------------------------------------------------------------\n"); fprintf(stderr, "Alloc'd objects: %10lu (%10lu overflow )\n", alloc_count, alloc_count - est_obj_cnt); From 7bfe6e261378a30980886994dabc0e7e4c9ce3d8 Mon Sep 17 00:00:00 2001 From: "Shawn O. Pearce" Date: Mon, 15 Jan 2007 06:35:41 -0500 Subject: [PATCH 43/81] Implemented manual packfile switching in fast-import. To help importers which are dealing with massive amounts of data fast-import needs to be able to close the packfile it is currently writing to and open a new packfile for any additional data that will be received. A new 'checkpoint' command has been introduced which can be used by the frontend import process to force this to occur at any time. 
This may be useful to ensure a very long running import doesn't lose any work due to unexpected failures. Signed-off-by: Shawn O. Pearce --- fast-import.c | 89 +++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 65 insertions(+), 24 deletions(-) diff --git a/fast-import.c b/fast-import.c index 12127168bd..3f747be287 100644 --- a/fast-import.c +++ b/fast-import.c @@ -40,6 +40,9 @@ Format of STDIN stream: ('from' sp (ref_str | hexsha1 | sha1exp_str | idnum) lf)? lf; + checkpoint ::= 'checkpoint' lf + lf; + # note: the first idnum in a stream should be 1 and subsequent # idnums should not have gaps between values as this will cause # the stream parser to reserve space for the gapped values. An @@ -112,6 +115,7 @@ struct object_entry struct object_entry *next; unsigned long offset; unsigned type : TYPE_BITS; + unsigned pack_id : 16; unsigned char sha1[20]; }; @@ -234,10 +238,10 @@ static struct atom_str **atom_table; /* The .pack file being generated */ static const char *base_name; -static unsigned int pack_count; -static char *pack_name; +static unsigned int pack_id; static char *idx_name; static struct packed_git *pack_data; +static struct packed_git **all_packs; static int pack_fd; static unsigned long pack_size; static unsigned char pack_sha1[20]; @@ -299,6 +303,7 @@ static struct object_entry* new_object(unsigned char *sha1) alloc_objects(object_entry_alloc); e = blocks->next_free++; + e->pack_id = pack_id; hashcpy(e->sha1, sha1); return e; } @@ -597,29 +602,30 @@ static void yread(int fd, void *buffer, size_t length) static void start_packfile() { + struct packed_git *p; struct pack_header hdr; - pack_count++; - pack_name = xmalloc(strlen(base_name) + 11); idx_name = xmalloc(strlen(base_name) + 11); - sprintf(pack_name, "%s%5.5i.pack", base_name, pack_count); - sprintf(idx_name, "%s%5.5i.idx", base_name, pack_count); + p = xcalloc(1, sizeof(*p) + strlen(base_name) + 13); + sprintf(p->pack_name, "%s%5.5i.pack", base_name, pack_id + 1); + 
sprintf(idx_name, "%s%5.5i.idx", base_name, pack_id + 1); - pack_fd = open(pack_name, O_RDWR|O_CREAT|O_EXCL, 0666); + pack_fd = open(p->pack_name, O_RDWR|O_CREAT|O_EXCL, 0666); if (pack_fd < 0) - die("Can't create %s: %s", pack_name, strerror(errno)); - - pack_data = xcalloc(1, sizeof(*pack_data) + strlen(pack_name) + 2); - strcpy(pack_data->pack_name, pack_name); - pack_data->pack_fd = pack_fd; + die("Can't create %s: %s", p->pack_name, strerror(errno)); + p->pack_fd = pack_fd; hdr.hdr_signature = htonl(PACK_SIGNATURE); hdr.hdr_version = htonl(2); hdr.hdr_entries = 0; - write_or_die(pack_fd, &hdr, sizeof(hdr)); + + pack_data = p; pack_size = sizeof(hdr); object_count = 0; + + all_packs = xrealloc(all_packs, sizeof(*all_packs) * (pack_id + 1)); + all_packs[pack_id] = p; } static void fixup_header_footer() @@ -673,7 +679,8 @@ static void write_index(const char *idx_name) c = idx; for (o = blocks; o; o = o->next_pool) for (e = o->entries; e != o->next_free; e++) - *c++ = e; + if (pack_id == e->pack_id) + *c++ = e; last = idx + object_count; qsort(idx, object_count, sizeof(struct object_entry*), oecmp); @@ -704,13 +711,28 @@ static void write_index(const char *idx_name) static void end_packfile() { + struct packed_git *old_p = pack_data, *new_p; + fixup_header_footer(); - close(pack_fd); write_index(idx_name); - free(pack_name); + /* Register the packfile with core git's machinary. */ + new_p = add_packed_git(idx_name, strlen(idx_name), 1); + if (!new_p) + die("core git rejected index %s", idx_name); + new_p->windows = old_p->windows; + new_p->pack_fd = old_p->pack_fd; + all_packs[pack_id++] = new_p; + install_packed_git(new_p); + free(old_p); free(idx_name); - free(pack_data); + + /* We can't carry a delta across packfiles. 
*/ + free(last_blob.data); + last_blob.data = NULL; + last_blob.len = 0; + last_blob.offset = 0; + last_blob.depth = 0; } static size_t encode_header( @@ -832,11 +854,15 @@ static int store_object( return 0; } -static void *gfi_unpack_entry(unsigned long ofs, unsigned long *sizep) +static void *gfi_unpack_entry( + struct object_entry *oe, + unsigned long *sizep) { - char type[20]; - pack_data->pack_size = pack_size + 20; - return unpack_entry(pack_data, ofs, type, sizep); + static char type[20]; + struct packed_git *p = all_packs[oe->pack_id]; + if (p == pack_data) + p->pack_size = pack_size + 20; + return unpack_entry(p, oe->offset, type, sizep); } static const char *get_mode(const char *str, unsigned int *modep) @@ -871,7 +897,7 @@ static void load_tree(struct tree_entry *root) if (myoe->type != OBJ_TREE) die("Not a tree: %s", sha1_to_hex(sha1)); t->delta_depth = 0; - buf = gfi_unpack_entry(myoe->offset, &size); + buf = gfi_unpack_entry(myoe, &size); } else { char type[20]; buf = read_sha1_file(sha1, type, &size); @@ -971,7 +997,9 @@ static void store_tree(struct tree_entry *root) } le = find_object(root->versions[0].sha1); - if (!S_ISDIR(root->versions[0].mode) || !le) { + if (!S_ISDIR(root->versions[0].mode) + || !le + || le->pack_id != pack_id) { lo.data = NULL; lo.depth = 0; } else { @@ -1385,7 +1413,7 @@ static void cmd_from(struct branch *b) if (oe->type != OBJ_COMMIT) die("Mark :%lu not a commit", idnum); hashcpy(b->sha1, oe->sha1); - buf = gfi_unpack_entry(oe->offset, &size); + buf = gfi_unpack_entry(oe, &size); if (!buf || size < 46) die("Not a valid commit: %s", from); if (memcmp("tree ", buf, 5) @@ -1713,6 +1741,15 @@ static void cmd_reset_branch() cmd_from(b); } +static void cmd_checkpoint() +{ + if (object_count) { + end_packfile(); + start_packfile(); + } + read_next_command(); +} + static const char fast_import_usage[] = "git-fast-import [--objects=n] [--depth=n] [--active-branches=n] [--export-marks=marks.file] [--branch-log=log] temp.pack"; @@ 
-1771,6 +1808,8 @@ int main(int argc, const char **argv) cmd_new_tag(); else if (!strncmp("reset ", command_buf.buf, 6)) cmd_reset_branch(); + else if (!strcmp("checkpoint", command_buf.buf)) + cmd_checkpoint(); else die("Unsupported command: %s", command_buf.buf); } @@ -1800,6 +1839,8 @@ int main(int argc, const char **argv) fprintf(stderr, " pools: %10lu KiB\n", total_allocd/1024); fprintf(stderr, " objects: %10lu KiB\n", (alloc_count*sizeof(struct object_entry))/1024); fprintf(stderr, "---------------------------------------------------------------------\n"); + pack_report(); + fprintf(stderr, "---------------------------------------------------------------------\n"); fprintf(stderr, "\n"); return 0; From 3e005baf8542a3116e51c4b0a27b72c7e14d949b Mon Sep 17 00:00:00 2001 From: "Shawn O. Pearce" Date: Mon, 15 Jan 2007 06:39:39 -0500 Subject: [PATCH 44/81] Don't create a final empty packfile in fast-import. If the last packfile is going to be empty (has 0 objects) then it shouldn't be kept after the import has terminated, as there is no point to the packfile. So rather than hashing it and making the index file, just delete the packfile. Signed-off-by: Shawn O. Pearce --- fast-import.c | 26 ++++++++++++++++---------- 1 file changed, 16 insertions(+), 10 deletions(-) diff --git a/fast-import.c b/fast-import.c index 3f747be287..207acb3230 100644 --- a/fast-import.c +++ b/fast-import.c @@ -713,17 +713,23 @@ static void end_packfile() { struct packed_git *old_p = pack_data, *new_p; - fixup_header_footer(); - write_index(idx_name); + if (object_count) { + fixup_header_footer(); + write_index(idx_name); - /* Register the packfile with core git's machinary. */ - new_p = add_packed_git(idx_name, strlen(idx_name), 1); - if (!new_p) - die("core git rejected index %s", idx_name); - new_p->windows = old_p->windows; - new_p->pack_fd = old_p->pack_fd; - all_packs[pack_id++] = new_p; - install_packed_git(new_p); + /* Register the packfile with core git's machinary. 
*/ + new_p = add_packed_git(idx_name, strlen(idx_name), 1); + if (!new_p) + die("core git rejected index %s", idx_name); + new_p->windows = old_p->windows; + new_p->pack_fd = old_p->pack_fd; + all_packs[pack_id++] = new_p; + install_packed_git(new_p); + } + else { + close(pack_fd); + unlink(old_p->pack_name); + } free(old_p); free(idx_name); From 2fce1f3c862845d23b2bd8305f97abb115623192 Mon Sep 17 00:00:00 2001 From: "Shawn O. Pearce" Date: Mon, 15 Jan 2007 06:51:58 -0500 Subject: [PATCH 45/81] Optimize index creation on large object sets in fast-import. When we are generating multiple packfiles at once we only need to scan the blocks of object_entry structs which contain objects for the current packfile. Because the most recent blocks are at the front of the linked list, and because all new objects going into the current file are allocated from the front of that list, we can stop scanning for objects as soon as we identify one which doesn't belong to the current packfile. Signed-off-by: Shawn O. Pearce --- fast-import.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/fast-import.c b/fast-import.c index 207acb3230..cfadda0432 100644 --- a/fast-import.c +++ b/fast-import.c @@ -678,10 +678,15 @@ static void write_index(const char *idx_name) idx = xmalloc(object_count * sizeof(struct object_entry*)); c = idx; for (o = blocks; o; o = o->next_pool) - for (e = o->entries; e != o->next_free; e++) - if (pack_id == e->pack_id) - *c++ = e; + for (e = o->next_free; e-- != o->entries;) { + if (pack_id != e->pack_id) + goto sort_index; + *c++ = e; + } +sort_index: last = idx + object_count; + if (c != last) + die("internal consistency error creating the index"); qsort(idx, object_count, sizeof(struct object_entry*), oecmp); /* Generate the fan-out array. */ From d9ee53ce45b0f1c26285417b900b3c5735721f7e Mon Sep 17 00:00:00 2001 From: "Shawn O. 
Pearce" Date: Mon, 15 Jan 2007 08:00:49 -0500 Subject: [PATCH 46/81] Implemented automatic checkpoints within fast-import. When the number of objects or number of bytes gets close to the limit allowed by the packfile format (or configured on the command line by our caller) we should automatically checkpoint the current packfile and start a new one before writing the object out. This does however require that we abandon the delta (if we had one) as it's not valid in a new packfile. I also added the simple rule that if we got a delta back but the delta itself is the same size as or larger than the uncompressed object to ignore the delta and just store the object data. This should avoid some really bad behavior caused by our current delta strategy. Signed-off-by: Shawn O. Pearce --- fast-import.c | 96 ++++++++++++++++++++++++++++++++++++--------------- 1 file changed, 68 insertions(+), 28 deletions(-) diff --git a/fast-import.c b/fast-import.c index cfadda0432..c19567f68c 100644 --- a/fast-import.c +++ b/fast-import.c @@ -217,6 +217,8 @@ struct hash_list /* Stats and misc. 
counters */ static unsigned long max_depth = 10; +static unsigned long max_objects = -1; +static unsigned long max_packsize = -1; static unsigned long alloc_count; static unsigned long branch_count; static unsigned long branch_load_count; @@ -303,7 +305,6 @@ static struct object_entry* new_object(unsigned char *sha1) alloc_objects(object_entry_alloc); e = blocks->next_free++; - e->pack_id = pack_id; hashcpy(e->sha1, sha1); return e; } @@ -678,12 +679,9 @@ static void write_index(const char *idx_name) idx = xmalloc(object_count * sizeof(struct object_entry*)); c = idx; for (o = blocks; o; o = o->next_pool) - for (e = o->next_free; e-- != o->entries;) { - if (pack_id != e->pack_id) - goto sort_index; - *c++ = e; - } -sort_index: + for (e = o->next_free; e-- != o->entries;) + if (pack_id == e->pack_id) + *c++ = e; last = idx + object_count; if (c != last) die("internal consistency error creating the index"); @@ -746,6 +744,12 @@ static void end_packfile() last_blob.depth = 0; } +static void checkpoint() +{ + end_packfile(); + start_packfile(); +} + static size_t encode_header( enum object_type type, size_t size, @@ -800,20 +804,64 @@ static int store_object( duplicate_count_by_type[type]++; return 1; } - e->type = type; - e->offset = pack_size; - object_count++; - object_count_by_type[type]++; - if (last && last->data && last->depth < max_depth) + if (last && last->data && last->depth < max_depth) { delta = diff_delta(last->data, last->len, dat, datlen, &deltalen, 0); - else - delta = 0; + if (delta && deltalen >= datlen) { + free(delta); + delta = NULL; + } + } else + delta = NULL; memset(&s, 0, sizeof(s)); deflateInit(&s, zlib_compression_level); + if (delta) { + s.next_in = delta; + s.avail_in = deltalen; + } else { + s.next_in = dat; + s.avail_in = datlen; + } + s.avail_out = deflateBound(&s, s.avail_in); + s.next_out = out = xmalloc(s.avail_out); + while (deflate(&s, Z_FINISH) == Z_OK) + /* nothing */; + deflateEnd(&s); + + /* Determine if we should 
auto-checkpoint. */ + if ((object_count + 1) > max_objects + || (object_count + 1) < object_count + || (pack_size + 60 + s.total_out) > max_packsize + || (pack_size + 60 + s.total_out) < pack_size) { + + /* This new object needs to *not* have the current pack_id. */ + e->pack_id = pack_id + 1; + checkpoint(); + + /* We cannot carry a delta into the new pack. */ + if (delta) { + free(delta); + delta = NULL; + } + memset(&s, 0, sizeof(s)); + deflateInit(&s, zlib_compression_level); + s.next_in = dat; + s.avail_in = datlen; + s.avail_out = deflateBound(&s, s.avail_in); + s.next_out = out; + while (deflate(&s, Z_FINISH) == Z_OK) + /* nothing */; + deflateEnd(&s); + } + + e->type = type; + e->pack_id = pack_id; + e->offset = pack_size; + object_count++; + object_count_by_type[type]++; if (delta) { unsigned long ofs = e->offset - last->offset; @@ -821,8 +869,6 @@ static int store_object( delta_count_by_type[type]++; last->depth++; - s.next_in = delta; - s.avail_in = deltalen; hdrlen = encode_header(OBJ_OFS_DELTA, deltalen, hdr); write_or_die(pack_fd, hdr, hdrlen); @@ -836,19 +882,11 @@ static int store_object( } else { if (last) last->depth = 0; - s.next_in = dat; - s.avail_in = datlen; hdrlen = encode_header(type, datlen, hdr); write_or_die(pack_fd, hdr, hdrlen); pack_size += hdrlen; } - s.avail_out = deflateBound(&s, s.avail_in); - s.next_out = out = xmalloc(s.avail_out); - while (deflate(&s, Z_FINISH) == Z_OK) - /* nothing */; - deflateEnd(&s); - write_or_die(pack_fd, out, s.total_out); pack_size += s.total_out; @@ -1754,10 +1792,8 @@ static void cmd_reset_branch() static void cmd_checkpoint() { - if (object_count) { - end_packfile(); - start_packfile(); - } + if (object_count) + checkpoint(); read_next_command(); } @@ -1780,6 +1816,10 @@ int main(int argc, const char **argv) break; else if (!strncmp(a, "--objects=", 10)) est_obj_cnt = strtoul(a + 10, NULL, 0); + else if (!strncmp(a, "--max-objects-per-pack=", 23)) + max_objects = strtoul(a + 23, NULL, 0); + else if 
(!strncmp(a, "--max-pack-size=", 16)) + max_packsize = strtoul(a + 16, NULL, 0) * 1024 * 1024; else if (!strncmp(a, "--depth=", 8)) max_depth = strtoul(a + 8, NULL, 0); else if (!strncmp(a, "--active-branches=", 18)) From 9d1b1b5ed7f4234ea4f2c1344ba67c6f89e2067c Mon Sep 17 00:00:00 2001 From: "Shawn O. Pearce" Date: Mon, 15 Jan 2007 08:03:38 -0500 Subject: [PATCH 47/81] Print the packfile names to stdout from fast-import. Caller scripts may want to know what packfiles the fast-import process just wrote out for them. This is now output to stdout, one packfile name per line, after we checkpoint each packfile. Signed-off-by: Shawn O. Pearce --- fast-import.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fast-import.c b/fast-import.c index c19567f68c..19d01e20ad 100644 --- a/fast-import.c +++ b/fast-import.c @@ -719,6 +719,8 @@ static void end_packfile() if (object_count) { fixup_header_footer(); write_index(idx_name); + fprintf(stdout, "%s\n", old_p->pack_name); + fflush(stdout); /* Register the packfile with core git's machinary. */ new_p = add_packed_git(idx_name, strlen(idx_name), 1); From 5d6f3ef6413172388ee5e6090afe9802a30a59f0 Mon Sep 17 00:00:00 2001 From: "Shawn O. Pearce" Date: Mon, 15 Jan 2007 23:40:27 -0500 Subject: [PATCH 48/81] Corrected buffer overflow during automatic checkpoint in fast-import. If we previously were using a delta but we needed to checkpoint the current packfile and switch to a new packfile we need to throw away the delta and compress the raw object by itself, as delta chains cannot span non-thin packfiles. Unfortunately the output buffer in this case needs to grow, as the size of the compressed object may be quite a bit larger than the size of the compressed delta. I've also avoided recompressing the object if we are checkpointing and we didn't use a delta. In this case the output buffer is the correct size and has already been populated with the right data, we just need to close out the current packfile and open a new one. 
Signed-off-by: Shawn O. Pearce --- fast-import.c | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/fast-import.c b/fast-import.c index 19d01e20ad..57d857c386 100644 --- a/fast-import.c +++ b/fast-import.c @@ -847,16 +847,17 @@ static int store_object( if (delta) { free(delta); delta = NULL; + + memset(&s, 0, sizeof(s)); + deflateInit(&s, zlib_compression_level); + s.next_in = dat; + s.avail_in = datlen; + s.avail_out = deflateBound(&s, s.avail_in); + s.next_out = out = xrealloc(out, s.avail_out); + while (deflate(&s, Z_FINISH) == Z_OK) + /* nothing */; + deflateEnd(&s); } - memset(&s, 0, sizeof(s)); - deflateInit(&s, zlib_compression_level); - s.next_in = dat; - s.avail_in = datlen; - s.avail_out = deflateBound(&s, s.avail_in); - s.next_out = out; - while (deflate(&s, Z_FINISH) == Z_OK) - /* nothing */; - deflateEnd(&s); } e->type = type; From 0ea9f045f4eaa1d37c6b318d9d6849a4f447b997 Mon Sep 17 00:00:00 2001 From: "Shawn O. Pearce" Date: Tue, 16 Jan 2007 00:33:19 -0500 Subject: [PATCH 49/81] Use uintmax_t for marks in fast-import. If a frontend wants to use a mark per file revision and per commit and is doing a truly huge import (such as a 32 GiB SVN repository) we may need more than 2**32 unique mark values, especially if the frontend is unable (or unwilling) to recycle mark values. For mark idnums we should use the largest unsigned integer type available, hoping that will be at least 64 bits when we are compiled as a 64 bit executable. This way we may consume huge amounts of memory storing our mark table, but we'll at least be able to process the entire import without failing. Signed-off-by: Shawn O. 
Pearce --- fast-import.c | 91 ++++++++++++++++++++++++----------------------- git-compat-util.h | 1 + 2 files changed, 48 insertions(+), 44 deletions(-) diff --git a/fast-import.c b/fast-import.c index 57d857c386..ebffa7c904 100644 --- a/fast-import.c +++ b/fast-import.c @@ -215,18 +215,20 @@ struct hash_list unsigned char sha1[20]; }; -/* Stats and misc. counters */ +/* Configured limits on output */ static unsigned long max_depth = 10; -static unsigned long max_objects = -1; static unsigned long max_packsize = -1; -static unsigned long alloc_count; +static uintmax_t max_objects = -1; + +/* Stats and misc. counters */ +static uintmax_t alloc_count; +static uintmax_t object_count; +static uintmax_t marks_set_count; +static uintmax_t object_count_by_type[1 << TYPE_BITS]; +static uintmax_t duplicate_count_by_type[1 << TYPE_BITS]; +static uintmax_t delta_count_by_type[1 << TYPE_BITS]; static unsigned long branch_count; static unsigned long branch_load_count; -static unsigned long object_count; -static unsigned long marks_set_count; -static unsigned long object_count_by_type[1 << TYPE_BITS]; -static unsigned long duplicate_count_by_type[1 << TYPE_BITS]; -static unsigned long delta_count_by_type[1 << TYPE_BITS]; /* Memory pools */ static size_t mem_pool_alloc = 2*1024*1024 - sizeof(struct mem_pool); @@ -279,7 +281,7 @@ static struct tag *last_tag; /* Input stream parsing */ static struct strbuf command_buf; -static unsigned long next_mark; +static uintmax_t next_mark; static struct dbuf new_data; static FILE* branch_log; @@ -406,7 +408,7 @@ static void size_dbuf(struct dbuf *b, size_t maxlen) b->buffer = xmalloc(b->capacity); } -static void insert_mark(unsigned long idnum, struct object_entry *oe) +static void insert_mark(uintmax_t idnum, struct object_entry *oe) { struct mark_set *s = marks; while ((idnum >> s->shift) >= 1024) { @@ -416,7 +418,7 @@ static void insert_mark(unsigned long idnum, struct object_entry *oe) marks = s; } while (s->shift) { - unsigned long i = 
idnum >> s->shift; + uintmax_t i = idnum >> s->shift; idnum -= i << s->shift; if (!s->data.sets[i]) { s->data.sets[i] = pool_calloc(1, sizeof(struct mark_set)); @@ -429,14 +431,14 @@ static void insert_mark(unsigned long idnum, struct object_entry *oe) s->data.marked[idnum] = oe; } -static struct object_entry* find_mark(unsigned long idnum) +static struct object_entry* find_mark(uintmax_t idnum) { - unsigned long orig_idnum = idnum; + uintmax_t orig_idnum = idnum; struct mark_set *s = marks; struct object_entry *oe = NULL; if ((idnum >> s->shift) < 1024) { while (s && s->shift) { - unsigned long i = idnum >> s->shift; + uintmax_t i = idnum >> s->shift; idnum -= i << s->shift; s = s->data.sets[i]; } @@ -444,7 +446,7 @@ static struct object_entry* find_mark(unsigned long idnum) oe = s->data.marked[idnum]; } if (!oe) - die("mark :%lu not declared", orig_idnum); + die("mark :%ju not declared", orig_idnum); return oe; } @@ -781,7 +783,7 @@ static int store_object( size_t datlen, struct last_object *last, unsigned char *sha1out, - unsigned long mark) + uintmax_t mark) { void *out, *delta; struct object_entry *e; @@ -1225,10 +1227,10 @@ static void dump_tags() } static void dump_marks_helper(FILE *f, - unsigned long base, + uintmax_t base, struct mark_set *m) { - int k; + uintmax_t k; if (m->shift) { for (k = 0; k < 1024; k++) { if (m->data.sets[k]) @@ -1238,7 +1240,7 @@ static void dump_marks_helper(FILE *f, } else { for (k = 0; k < 1024; k++) { if (m->data.marked[k]) - fprintf(f, ":%lu %s\n", base + k, + fprintf(f, ":%ju %s\n", base + k, sha1_to_hex(m->data.marked[k]->sha1)); } } @@ -1262,7 +1264,7 @@ static void read_next_command() static void cmd_mark() { if (!strncmp("mark :", command_buf.buf, 6)) { - next_mark = strtoul(command_buf.buf + 6, NULL, 10); + next_mark = strtoumax(command_buf.buf + 6, NULL, 10); read_next_command(); } else @@ -1375,7 +1377,7 @@ static void file_change_m(struct branch *b) if (*p == ':') { char *x; - oe = find_mark(strtoul(p + 1, &x, 10)); 
+ oe = find_mark(strtoumax(p + 1, &x, 10)); hashcpy(sha1, oe->sha1); p = x; } else { @@ -1458,12 +1460,12 @@ static void cmd_from(struct branch *b) hashcpy(b->branch_tree.versions[0].sha1, t); hashcpy(b->branch_tree.versions[1].sha1, t); } else if (*from == ':') { - unsigned long idnum = strtoul(from + 1, NULL, 10); + uintmax_t idnum = strtoumax(from + 1, NULL, 10); struct object_entry *oe = find_mark(idnum); unsigned long size; char *buf; if (oe->type != OBJ_COMMIT) - die("Mark :%lu not a commit", idnum); + die("Mark :%ju not a commit", idnum); hashcpy(b->sha1, oe->sha1); buf = gfi_unpack_entry(oe, &size); if (!buf || size < 46) @@ -1521,10 +1523,10 @@ static struct hash_list* cmd_merge(unsigned int *count) if (s) hashcpy(n->sha1, s->sha1); else if (*from == ':') { - unsigned long idnum = strtoul(from + 1, NULL, 10); + uintmax_t idnum = strtoumax(from + 1, NULL, 10); struct object_entry *oe = find_mark(idnum); if (oe->type != OBJ_COMMIT) - die("Mark :%lu not a commit", idnum); + die("Mark :%ju not a commit", idnum); hashcpy(n->sha1, oe->sha1); } else if (get_sha1(from, n->sha1)) die("Invalid ref name or SHA1 expression: %s", from); @@ -1650,7 +1652,7 @@ static void cmd_new_commit() fputc('"', branch_log); } else fprintf(branch_log, "%s", b->name); - fprintf(branch_log," :%lu %s\n",next_mark,sha1_to_hex(b->sha1)); + fprintf(branch_log," :%ju %s\n",next_mark,sha1_to_hex(b->sha1)); } } @@ -1665,7 +1667,7 @@ static void cmd_new_tag() void *msg; size_t msglen; struct tag *t; - unsigned long from_mark = 0; + uintmax_t from_mark = 0; unsigned char sha1[20]; /* Obtain the new tag name from the rest of our command */ @@ -1704,10 +1706,10 @@ static void cmd_new_tag() if (s) { hashcpy(sha1, s->sha1); } else if (*from == ':') { - from_mark = strtoul(from + 1, NULL, 10); + from_mark = strtoumax(from + 1, NULL, 10); struct object_entry *oe = find_mark(from_mark); if (oe->type != OBJ_COMMIT) - die("Mark :%lu not a commit", from_mark); + die("Mark :%ju not a commit", from_mark); 
hashcpy(sha1, oe->sha1); } else if (!get_sha1(from, sha1)) { unsigned long size; @@ -1758,7 +1760,7 @@ static void cmd_new_tag() fputc('"', branch_log); } else fprintf(branch_log, "%s", t->name); - fprintf(branch_log," :%lu %s\n",from_mark,sha1_to_hex(t->sha1)); + fprintf(branch_log," :%ju %s\n",from_mark,sha1_to_hex(t->sha1)); } } @@ -1806,8 +1808,8 @@ static const char fast_import_usage[] = int main(int argc, const char **argv) { int i; - unsigned long est_obj_cnt = object_entry_alloc; - unsigned long duplicate_count; + uintmax_t est_obj_cnt = object_entry_alloc; + uintmax_t duplicate_count; setup_ident(); git_config(git_default_config); @@ -1818,11 +1820,11 @@ int main(int argc, const char **argv) if (*a != '-' || !strcmp(a, "--")) break; else if (!strncmp(a, "--objects=", 10)) - est_obj_cnt = strtoul(a + 10, NULL, 0); + est_obj_cnt = strtoumax(a + 10, NULL, 0); else if (!strncmp(a, "--max-objects-per-pack=", 23)) - max_objects = strtoul(a + 23, NULL, 0); + max_objects = strtoumax(a + 23, NULL, 0); else if (!strncmp(a, "--max-pack-size=", 16)) - max_packsize = strtoul(a + 16, NULL, 0) * 1024 * 1024; + max_packsize = strtoumax(a + 16, NULL, 0) * 1024 * 1024; else if (!strncmp(a, "--depth=", 8)) max_depth = strtoul(a + 8, NULL, 0); else if (!strncmp(a, "--active-branches=", 18)) @@ -1875,23 +1877,24 @@ int main(int argc, const char **argv) if (branch_log) fclose(branch_log); + duplicate_count = 0; for (i = 0; i < ARRAY_SIZE(duplicate_count_by_type); i++) duplicate_count += duplicate_count_by_type[i]; fprintf(stderr, "%s statistics:\n", argv[0]); fprintf(stderr, "---------------------------------------------------------------------\n"); - fprintf(stderr, "Alloc'd objects: %10lu (%10lu overflow )\n", alloc_count, alloc_count - est_obj_cnt); - fprintf(stderr, "Total objects: %10lu (%10lu duplicates )\n", object_count, duplicate_count); - fprintf(stderr, " blobs : %10lu (%10lu duplicates %10lu deltas)\n", object_count_by_type[OBJ_BLOB], 
duplicate_count_by_type[OBJ_BLOB], delta_count_by_type[OBJ_BLOB]); - fprintf(stderr, " trees : %10lu (%10lu duplicates %10lu deltas)\n", object_count_by_type[OBJ_TREE], duplicate_count_by_type[OBJ_TREE], delta_count_by_type[OBJ_TREE]); - fprintf(stderr, " commits: %10lu (%10lu duplicates %10lu deltas)\n", object_count_by_type[OBJ_COMMIT], duplicate_count_by_type[OBJ_COMMIT], delta_count_by_type[OBJ_COMMIT]); - fprintf(stderr, " tags : %10lu (%10lu duplicates %10lu deltas)\n", object_count_by_type[OBJ_TAG], duplicate_count_by_type[OBJ_TAG], delta_count_by_type[OBJ_TAG]); + fprintf(stderr, "Alloc'd objects: %10ju (%10ju overflow )\n", alloc_count, alloc_count - est_obj_cnt); + fprintf(stderr, "Total objects: %10ju (%10ju duplicates )\n", object_count, duplicate_count); + fprintf(stderr, " blobs : %10ju (%10ju duplicates %10ju deltas)\n", object_count_by_type[OBJ_BLOB], duplicate_count_by_type[OBJ_BLOB], delta_count_by_type[OBJ_BLOB]); + fprintf(stderr, " trees : %10ju (%10ju duplicates %10ju deltas)\n", object_count_by_type[OBJ_TREE], duplicate_count_by_type[OBJ_TREE], delta_count_by_type[OBJ_TREE]); + fprintf(stderr, " commits: %10ju (%10ju duplicates %10ju deltas)\n", object_count_by_type[OBJ_COMMIT], duplicate_count_by_type[OBJ_COMMIT], delta_count_by_type[OBJ_COMMIT]); + fprintf(stderr, " tags : %10ju (%10ju duplicates %10ju deltas)\n", object_count_by_type[OBJ_TAG], duplicate_count_by_type[OBJ_TAG], delta_count_by_type[OBJ_TAG]); fprintf(stderr, "Total branches: %10lu (%10lu loads )\n", branch_count, branch_load_count); - fprintf(stderr, " marks: %10u (%10lu unique )\n", (1 << marks->shift) * 1024, marks_set_count); + fprintf(stderr, " marks: %10ju (%10ju unique )\n", (((uintmax_t)1) << marks->shift) * 1024, marks_set_count); fprintf(stderr, " atoms: %10u\n", atom_cnt); - fprintf(stderr, "Memory total: %10lu KiB\n", (total_allocd + alloc_count*sizeof(struct object_entry))/1024); + fprintf(stderr, "Memory total: %10ju KiB\n", (total_allocd + 
alloc_count*sizeof(struct object_entry))/1024); fprintf(stderr, " pools: %10lu KiB\n", total_allocd/1024); - fprintf(stderr, " objects: %10lu KiB\n", (alloc_count*sizeof(struct object_entry))/1024); + fprintf(stderr, " objects: %10ju KiB\n", (alloc_count*sizeof(struct object_entry))/1024); fprintf(stderr, "---------------------------------------------------------------------\n"); pack_report(); fprintf(stderr, "---------------------------------------------------------------------\n"); diff --git a/git-compat-util.h b/git-compat-util.h index 8781e8e22d..614583e56a 100644 --- a/git-compat-util.h +++ b/git-compat-util.h @@ -28,6 +28,7 @@ #include #include #include +#include #include #include #include From 6cf092619376f5bf60987f146d142497ded2f718 Mon Sep 17 00:00:00 2001 From: "Shawn O. Pearce" Date: Tue, 16 Jan 2007 00:35:41 -0500 Subject: [PATCH 50/81] Replace redundant yread() with read_in_full() in fast-import. Prior to git having read_in_full() fast-import used its own private function yread to perform the header reading task. No sense in keeping that around now that read_in_full is a public, stable function. Signed-off-by: Shawn O. 
Pearce --- fast-import.c | 16 ++-------------- 1 file changed, 2 insertions(+), 14 deletions(-) diff --git a/fast-import.c b/fast-import.c index ebffa7c904..938707c5bd 100644 --- a/fast-import.c +++ b/fast-import.c @@ -590,19 +590,6 @@ static void release_tree_entry(struct tree_entry *e) avail_tree_entry = e; } -static void yread(int fd, void *buffer, size_t length) -{ - ssize_t ret = 0; - while (ret < length) { - ssize_t size = xread(fd, (char *) buffer + ret, length - ret); - if (!size) - die("Read from descriptor %i: end of stream", fd); - if (size < 0) - die("Read from descriptor %i: %s", fd, strerror(errno)); - ret += size; - } -} - static void start_packfile() { struct packed_git *p; @@ -642,7 +629,8 @@ static void fixup_header_footer() die("Failed seeking to start: %s", strerror(errno)); SHA1_Init(&c); - yread(pack_fd, hdr, 8); + if (read_in_full(pack_fd, hdr, 8) != 8) + die("Unable to reread header of %s", pack_data->pack_name); SHA1_Update(&c, hdr, 8); cnt = htonl(object_count); From 09543c96bbe41d312bc002c293a193aa328c839d Mon Sep 17 00:00:00 2001 From: "Shawn O. Pearce" Date: Tue, 16 Jan 2007 00:44:48 -0500 Subject: [PATCH 51/81] Reuse sha1 in packed_git in fast-import. Rather than maintaing our own packfile level sha1 variable we can make use of the one already available in struct packed_git. Its meant for the SHA1 of the index but it can also hold the SHA1 of the packfile itself between final checksumming of the packfile and creation of the index. Signed-off-by: Shawn O. Pearce --- fast-import.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/fast-import.c b/fast-import.c index 938707c5bd..5767e808c6 100644 --- a/fast-import.c +++ b/fast-import.c @@ -248,7 +248,6 @@ static struct packed_git *pack_data; static struct packed_git **all_packs; static int pack_fd; static unsigned long pack_size; -static unsigned char pack_sha1[20]; /* Table of objects we've written. 
*/ static unsigned int object_entry_alloc = 5000; @@ -646,8 +645,8 @@ static void fixup_header_footer() } free(buf); - SHA1_Final(pack_sha1, &c); - write_or_die(pack_fd, pack_sha1, sizeof(pack_sha1)); + SHA1_Final(pack_data->sha1, &c); + write_or_die(pack_fd, pack_data->sha1, sizeof(pack_data->sha1)); } static int oecmp (const void *a_, const void *b_) @@ -697,8 +696,8 @@ static void write_index(const char *idx_name) sha1write(f, &offset, 4); sha1write(f, (*c)->sha1, sizeof((*c)->sha1)); } - sha1write(f, pack_sha1, sizeof(pack_sha1)); - sha1close(f, NULL, 1); + sha1write(f, pack_data->sha1, sizeof(pack_data->sha1)); + sha1close(f, pack_data->sha1, 1); free(idx); } From 8455e48476634eeff6fd2cd4f245cadfef14bbc8 Mon Sep 17 00:00:00 2001 From: "Shawn O. Pearce" Date: Tue, 16 Jan 2007 01:15:31 -0500 Subject: [PATCH 52/81] Use .keep files in fast-import during processing. Because fast-import automatically updates all references (heads and tags) at the end of its run the repository is corrupt unless the objects are available in the .git/objects/pack directory prior to the refs being modified. The easiest way to ensure that is true is to move the packfile and its associated index directly into the .git/objects/pack directory as soon as we have finished output to it. But the only safe way to do this is to create the a temporary .keep file for that pack, so we use the same tricks that index-pack uses when its being invoked by receive-pack. Signed-off-by: Shawn O. 
Pearce --- fast-import.c | 91 ++++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 71 insertions(+), 20 deletions(-) diff --git a/fast-import.c b/fast-import.c index 5767e808c6..393020504a 100644 --- a/fast-import.c +++ b/fast-import.c @@ -241,9 +241,7 @@ static unsigned int atom_cnt; static struct atom_str **atom_table; /* The .pack file being generated */ -static const char *base_name; static unsigned int pack_id; -static char *idx_name; static struct packed_git *pack_data; static struct packed_git **all_packs; static int pack_fd; @@ -591,17 +589,17 @@ static void release_tree_entry(struct tree_entry *e) static void start_packfile() { + static char tmpfile[PATH_MAX]; struct packed_git *p; struct pack_header hdr; - idx_name = xmalloc(strlen(base_name) + 11); - p = xcalloc(1, sizeof(*p) + strlen(base_name) + 13); - sprintf(p->pack_name, "%s%5.5i.pack", base_name, pack_id + 1); - sprintf(idx_name, "%s%5.5i.idx", base_name, pack_id + 1); - - pack_fd = open(p->pack_name, O_RDWR|O_CREAT|O_EXCL, 0666); + snprintf(tmpfile, sizeof(tmpfile), + "%s/pack_XXXXXX", get_object_directory()); + pack_fd = mkstemp(tmpfile); if (pack_fd < 0) - die("Can't create %s: %s", p->pack_name, strerror(errno)); + die("Can't create %s: %s", tmpfile, strerror(errno)); + p = xcalloc(1, sizeof(*p) + strlen(tmpfile) + 2); + strcpy(p->pack_name, tmpfile); p->pack_fd = pack_fd; hdr.hdr_signature = htonl(PACK_SIGNATURE); @@ -656,13 +654,15 @@ static int oecmp (const void *a_, const void *b_) return hashcmp(a->sha1, b->sha1); } -static void write_index(const char *idx_name) +static char* create_index() { + static char tmpfile[PATH_MAX]; + SHA_CTX ctx; struct sha1file *f; struct object_entry **idx, **c, **last, *e; struct object_entry_pool *o; unsigned int array[256]; - int i; + int i, idx_fd; /* Build the sorted table of object IDs. 
*/ idx = xmalloc(object_count * sizeof(struct object_entry*)); @@ -689,16 +689,68 @@ static void write_index(const char *idx_name) c = next; } - f = sha1create("%s", idx_name); + snprintf(tmpfile, sizeof(tmpfile), + "%s/index_XXXXXX", get_object_directory()); + idx_fd = mkstemp(tmpfile); + if (idx_fd < 0) + die("Can't create %s: %s", tmpfile, strerror(errno)); + f = sha1fd(idx_fd, tmpfile); sha1write(f, array, 256 * sizeof(int)); + SHA1_Init(&ctx); for (c = idx; c != last; c++) { unsigned int offset = htonl((*c)->offset); sha1write(f, &offset, 4); sha1write(f, (*c)->sha1, sizeof((*c)->sha1)); + SHA1_Update(&ctx, (*c)->sha1, 20); } sha1write(f, pack_data->sha1, sizeof(pack_data->sha1)); - sha1close(f, pack_data->sha1, 1); + sha1close(f, NULL, 1); free(idx); + SHA1_Final(pack_data->sha1, &ctx); + return tmpfile; +} + +static char* keep_pack(char *curr_index_name) +{ + static char name[PATH_MAX]; + static char *keep_msg = "fast-import"; + int keep_fd; + + chmod(pack_data->pack_name, 0444); + chmod(curr_index_name, 0444); + + snprintf(name, sizeof(name), "%s/pack/pack-%s.keep", + get_object_directory(), sha1_to_hex(pack_data->sha1)); + keep_fd = open(name, O_RDWR|O_CREAT|O_EXCL, 0600); + if (keep_fd < 0) + die("cannot create keep file"); + write(keep_fd, keep_msg, strlen(keep_msg)); + close(keep_fd); + + snprintf(name, sizeof(name), "%s/pack/pack-%s.pack", + get_object_directory(), sha1_to_hex(pack_data->sha1)); + if (move_temp_to_file(pack_data->pack_name, name)) + die("cannot store pack file"); + printf("%s\n", name); + + snprintf(name, sizeof(name), "%s/pack/pack-%s.idx", + get_object_directory(), sha1_to_hex(pack_data->sha1)); + if (move_temp_to_file(curr_index_name, name)) + die("cannot store index file"); + return name; +} + +static void unkeep_all_packs() +{ + static char name[PATH_MAX]; + int k; + + for (k = 0; k < pack_id; k++) { + struct packed_git *p = all_packs[k]; + snprintf(name, sizeof(name), "%s/pack/pack-%s.keep", + get_object_directory(), 
sha1_to_hex(p->sha1)); + unlink(name); + } } static void end_packfile() @@ -706,10 +758,10 @@ static void end_packfile() struct packed_git *old_p = pack_data, *new_p; if (object_count) { + char *idx_name; + fixup_header_footer(); - write_index(idx_name); - fprintf(stdout, "%s\n", old_p->pack_name); - fflush(stdout); + idx_name = keep_pack(create_index()); /* Register the packfile with core git's machinary. */ new_p = add_packed_git(idx_name, strlen(idx_name), 1); @@ -725,7 +777,6 @@ static void end_packfile() unlink(old_p->pack_name); } free(old_p); - free(idx_name); /* We can't carry a delta across packfiles. */ free(last_blob.data); @@ -1790,7 +1841,7 @@ static void cmd_checkpoint() } static const char fast_import_usage[] = -"git-fast-import [--objects=n] [--depth=n] [--active-branches=n] [--export-marks=marks.file] [--branch-log=log] temp.pack"; +"git-fast-import [--objects=n] [--depth=n] [--active-branches=n] [--export-marks=marks.file] [--branch-log=log]"; int main(int argc, const char **argv) { @@ -1826,9 +1877,8 @@ int main(int argc, const char **argv) else die("unknown option %s", a); } - if ((i+1) != argc) + if (i != argc) usage(fast_import_usage); - base_name = argv[i]; alloc_objects(est_obj_cnt); strbuf_init(&command_buf); @@ -1860,6 +1910,7 @@ int main(int argc, const char **argv) dump_branches(); dump_tags(); + unkeep_all_packs(); dump_marks(); if (branch_log) fclose(branch_log); From 1280158738333109cf0ada2fb378db2cdf7296ad Mon Sep 17 00:00:00 2001 From: "Shawn O. Pearce" Date: Tue, 16 Jan 2007 01:17:47 -0500 Subject: [PATCH 53/81] Ensure we close the packfile after creating it in fast-import. Because we are renaming the packfile into its file destination we need to be sure its not open when the rename is called, otherwise some operating systems (e.g. Windows) may prevent the rename from occurring. Signed-off-by: Shawn O. 
Pearce --- fast-import.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/fast-import.c b/fast-import.c index 393020504a..a9cf22dfe2 100644 --- a/fast-import.c +++ b/fast-import.c @@ -645,6 +645,7 @@ static void fixup_header_footer() SHA1_Final(pack_data->sha1, &c); write_or_die(pack_fd, pack_data->sha1, sizeof(pack_data->sha1)); + close(pack_fd); } static int oecmp (const void *a_, const void *b_) @@ -768,14 +769,11 @@ static void end_packfile() if (!new_p) die("core git rejected index %s", idx_name); new_p->windows = old_p->windows; - new_p->pack_fd = old_p->pack_fd; all_packs[pack_id++] = new_p; install_packed_git(new_p); } - else { - close(pack_fd); + else unlink(old_p->pack_name); - } free(old_p); /* We can't carry a delta across packfiles. */ From 0fcbcae75372f96539ba0f9598112c417d81ab0d Mon Sep 17 00:00:00 2001 From: "Shawn O. Pearce" Date: Tue, 16 Jan 2007 01:20:57 -0500 Subject: [PATCH 54/81] Remove unnecessary pack_fd global in fast-import. Much like the pack_sha1 the pack_fd is an unnecessary global variable, we already have the fd stored in our struct packed_git *pack_data so that the core library functions in sha1_file.c are able to lookup and decompress object data that we have previously written. Keeping an extra copy of this value in our own variable is just a hold-over from earlier versions of fast-import and is now completely unnecessary. Signed-off-by: Shawn O. Pearce --- fast-import.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/fast-import.c b/fast-import.c index a9cf22dfe2..281b8f6a5e 100644 --- a/fast-import.c +++ b/fast-import.c @@ -244,7 +244,6 @@ static struct atom_str **atom_table; static unsigned int pack_id; static struct packed_git *pack_data; static struct packed_git **all_packs; -static int pack_fd; static unsigned long pack_size; /* Table of objects we've written. 
*/ @@ -592,6 +591,7 @@ static void start_packfile() static char tmpfile[PATH_MAX]; struct packed_git *p; struct pack_header hdr; + int pack_fd; snprintf(tmpfile, sizeof(tmpfile), "%s/pack_XXXXXX", get_object_directory()); @@ -605,7 +605,7 @@ static void start_packfile() hdr.hdr_signature = htonl(PACK_SIGNATURE); hdr.hdr_version = htonl(2); hdr.hdr_entries = 0; - write_or_die(pack_fd, &hdr, sizeof(hdr)); + write_or_die(p->pack_fd, &hdr, sizeof(hdr)); pack_data = p; pack_size = sizeof(hdr); @@ -617,6 +617,7 @@ static void start_packfile() static void fixup_header_footer() { + int pack_fd = pack_data->pack_fd; SHA_CTX c; char hdr[8]; unsigned long cnt; @@ -912,23 +913,23 @@ static int store_object( last->depth++; hdrlen = encode_header(OBJ_OFS_DELTA, deltalen, hdr); - write_or_die(pack_fd, hdr, hdrlen); + write_or_die(pack_data->pack_fd, hdr, hdrlen); pack_size += hdrlen; hdr[pos] = ofs & 127; while (ofs >>= 7) hdr[--pos] = 128 | (--ofs & 127); - write_or_die(pack_fd, hdr + pos, sizeof(hdr) - pos); + write_or_die(pack_data->pack_fd, hdr + pos, sizeof(hdr) - pos); pack_size += sizeof(hdr) - pos; } else { if (last) last->depth = 0; hdrlen = encode_header(type, datlen, hdr); - write_or_die(pack_fd, hdr, hdrlen); + write_or_die(pack_data->pack_fd, hdr, hdrlen); pack_size += hdrlen; } - write_or_die(pack_fd, out, s.total_out); + write_or_die(pack_data->pack_fd, out, s.total_out); pack_size += s.total_out; free(out); From eec11c24840bfc5293a80fed3c3b1e5bc10ac453 Mon Sep 17 00:00:00 2001 From: "Shawn O. Pearce" Date: Tue, 16 Jan 2007 04:25:12 -0500 Subject: [PATCH 55/81] Correct max_packsize default in fast-import. Apparently amd64 has defined 'unsigned long' to be a 64 bit value, which means -1 was way over the 4 GiB packfile limit. Whoops. Signed-off-by: Shawn O. 
Pearce --- fast-import.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fast-import.c b/fast-import.c index 281b8f6a5e..8342314bb0 100644 --- a/fast-import.c +++ b/fast-import.c @@ -217,7 +217,7 @@ struct hash_list /* Configured limits on output */ static unsigned long max_depth = 10; -static unsigned long max_packsize = -1; +static unsigned long max_packsize = (1LL << 32) - 1; static uintmax_t max_objects = -1; /* Stats and misc. counters */ From a7ddc48765ff2e4f6601ea146cba4283a342e0b1 Mon Sep 17 00:00:00 2001 From: "Shawn O. Pearce" Date: Tue, 16 Jan 2007 04:55:41 -0500 Subject: [PATCH 56/81] Correct object_count type and stat output in fast-import. Since object_count is limited to 'unsigned long' (really an unsigned 32 bit integer value) by the pack file format we may as well use exactly that type here in fast-import for that counter. An earlier change by me incorrectly made it uintmax_t. But since object_count is a counter for the current packfile only, we don't want to output its value at the end. Instead we should sum up the individual type counters and report that total, as that will cover all of the packfiles. Signed-off-by: Shawn O. Pearce --- fast-import.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/fast-import.c b/fast-import.c index 8342314bb0..3992af5f25 100644 --- a/fast-import.c +++ b/fast-import.c @@ -222,11 +222,11 @@ static uintmax_t max_objects = -1; /* Stats and misc. 
counters */ static uintmax_t alloc_count; -static uintmax_t object_count; static uintmax_t marks_set_count; static uintmax_t object_count_by_type[1 << TYPE_BITS]; static uintmax_t duplicate_count_by_type[1 << TYPE_BITS]; static uintmax_t delta_count_by_type[1 << TYPE_BITS]; +static unsigned long object_count; static unsigned long branch_count; static unsigned long branch_load_count; @@ -1846,7 +1846,7 @@ int main(int argc, const char **argv) { int i; uintmax_t est_obj_cnt = object_entry_alloc; - uintmax_t duplicate_count; + uintmax_t total_count, duplicate_count; setup_ident(); git_config(git_default_config); @@ -1914,6 +1914,9 @@ int main(int argc, const char **argv) if (branch_log) fclose(branch_log); + total_count = 0; + for (i = 0; i < ARRAY_SIZE(object_count_by_type); i++) + total_count += object_count_by_type[i]; duplicate_count = 0; for (i = 0; i < ARRAY_SIZE(duplicate_count_by_type); i++) duplicate_count += duplicate_count_by_type[i]; @@ -1921,7 +1924,7 @@ int main(int argc, const char **argv) fprintf(stderr, "%s statistics:\n", argv[0]); fprintf(stderr, "---------------------------------------------------------------------\n"); fprintf(stderr, "Alloc'd objects: %10ju (%10ju overflow )\n", alloc_count, alloc_count - est_obj_cnt); - fprintf(stderr, "Total objects: %10ju (%10ju duplicates )\n", object_count, duplicate_count); + fprintf(stderr, "Total objects: %10ju (%10ju duplicates )\n", total_count, duplicate_count); fprintf(stderr, " blobs : %10ju (%10ju duplicates %10ju deltas)\n", object_count_by_type[OBJ_BLOB], duplicate_count_by_type[OBJ_BLOB], delta_count_by_type[OBJ_BLOB]); fprintf(stderr, " trees : %10ju (%10ju duplicates %10ju deltas)\n", object_count_by_type[OBJ_TREE], duplicate_count_by_type[OBJ_TREE], delta_count_by_type[OBJ_TREE]); fprintf(stderr, " commits: %10ju (%10ju duplicates %10ju deltas)\n", object_count_by_type[OBJ_COMMIT], duplicate_count_by_type[OBJ_COMMIT], delta_count_by_type[OBJ_COMMIT]); From 
2369ed79071edf0f040eb2c280e1e2cf9a883bb9 Mon Sep 17 00:00:00 2001 From: "Shawn O. Pearce" Date: Tue, 16 Jan 2007 16:18:44 -0500 Subject: [PATCH 57/81] Print out the edge commits for each packfile in fast-import. To help callers repack very large repositories into a series of packfiles fast-import now outputs the last commits/tags it wrote to a packfile when it prints out the packfile name. This information can be feed to pack-objects --revs to repack. For the first pack of an initial import this is pretty easy (just feed those SHA1s on stdin) but for subsequent packs you want to feed the subsequent pack's final SHA1s but also all prior pack's SHA1s prefixed with the negation operator. This way the prior pack's data does not get included into the subsequent pack. Signed-off-by: Shawn O. Pearce --- fast-import.c | 26 ++++++++++++++++++++++++-- 1 file changed, 24 insertions(+), 2 deletions(-) diff --git a/fast-import.c b/fast-import.c index 3992af5f25..84dfde9d2f 100644 --- a/fast-import.c +++ b/fast-import.c @@ -193,6 +193,7 @@ struct branch const char *name; unsigned long last_commit; struct tree_entry branch_tree; + unsigned int pack_id; unsigned char sha1[20]; }; @@ -200,6 +201,7 @@ struct tag { struct tag *next_tag; const char *name; + unsigned int pack_id; unsigned char sha1[20]; }; @@ -733,7 +735,6 @@ static char* keep_pack(char *curr_index_name) get_object_directory(), sha1_to_hex(pack_data->sha1)); if (move_temp_to_file(pack_data->pack_name, name)) die("cannot store pack file"); - printf("%s\n", name); snprintf(name, sizeof(name), "%s/pack/pack-%s.idx", get_object_directory(), sha1_to_hex(pack_data->sha1)); @@ -761,6 +762,9 @@ static void end_packfile() if (object_count) { char *idx_name; + int i; + struct branch *b; + struct tag *t; fixup_header_footer(); idx_name = keep_pack(create_index()); @@ -770,8 +774,24 @@ static void end_packfile() if (!new_p) die("core git rejected index %s", idx_name); new_p->windows = old_p->windows; - all_packs[pack_id++] = 
new_p; + all_packs[pack_id] = new_p; install_packed_git(new_p); + + /* Print the boundary */ + fprintf(stdout, "%s:", new_p->pack_name); + for (i = 0; i < branch_table_sz; i++) { + for (b = branch_table[i]; b; b = b->table_next_branch) { + if (b->pack_id == pack_id) + fprintf(stdout, " %s", sha1_to_hex(b->sha1)); + } + } + for (t = first_tag; t; t = t->next_tag) { + if (t->pack_id == pack_id) + fprintf(stdout, " %s", sha1_to_hex(t->sha1)); + } + fputc('\n', stdout); + + pack_id++; } else unlink(old_p->pack_name); @@ -1679,6 +1699,7 @@ static void cmd_new_commit() new_data.buffer, sp - (char*)new_data.buffer, NULL, b->sha1, next_mark); b->last_commit = object_count_by_type[OBJ_COMMIT]; + b->pack_id = pack_id; if (branch_log) { int need_dq = quote_c_style(b->name, NULL, NULL, 0); @@ -1787,6 +1808,7 @@ static void cmd_new_tag() store_object(OBJ_TAG, new_data.buffer, sp - (char*)new_data.buffer, NULL, t->sha1, 0); + t->pack_id = pack_id; if (branch_log) { int need_dq = quote_c_style(t->name, NULL, NULL, 0); From 2104838bf9b97066f21e4c32efdfa424d41e6b98 Mon Sep 17 00:00:00 2001 From: "Shawn O. Pearce" Date: Wed, 17 Jan 2007 00:33:18 -0500 Subject: [PATCH 58/81] Corrected BNF input documentation for fast-import. Now that fast-import uses uintmax_t (the largest available unsigned integer type) for marks we don't want to say its an unsigned 32 bit integer in ASCII base 10 notation. It could be much larger, especially on 64 bit systems, and especially if a frontend uses a very large number of marks (1 per file revision on a very, very large import). Signed-off-by: Shawn O. 
Pearce --- fast-import.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fast-import.c b/fast-import.c index 84dfde9d2f..f1b26d103f 100644 --- a/fast-import.c +++ b/fast-import.c @@ -72,6 +72,7 @@ Format of STDIN stream: path_str ::= path | '"' quoted(path) '"' ; declen ::= # unsigned 32 bit value, ascii base10 notation; + bigint ::= # unsigned integer value, ascii base10 notation; binary_data ::= # file content, not interpreted; sp ::= # ASCII space character; @@ -81,7 +82,7 @@ Format of STDIN stream: # an idnum. This is to distinguish it from a ref or tag name as # GIT does not permit ':' in ref or tag strings. # - idnum ::= ':' declen; + idnum ::= ':' bigint; path ::= # GIT style file path, e.g. "a/b/c"; ref ::= # GIT ref name, e.g. "refs/heads/MOZ_GECKO_EXPERIMENT"; tag ::= # GIT tag name, e.g. "FIREFOX_1_5"; From 6f64f6d9d2b12cdae1648cbf536685c888f3b981 Mon Sep 17 00:00:00 2001 From: "Shawn O. Pearce" Date: Wed, 17 Jan 2007 00:57:23 -0500 Subject: [PATCH 59/81] Correct a few types to be unsigned in fast-import. The length of an atom string cannot be negative. So make it explicit and declare it as an unsigned value. The shift width in a mark table node also cannot be negative. I'm also moving it to after the pointer arrays to prevent any possible alignment problems on a 64 bit system. Signed-off-by: Shawn O. 
Pearce --- fast-import.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fast-import.c b/fast-import.c index f1b26d103f..2c500d6be3 100644 --- a/fast-import.c +++ b/fast-import.c @@ -130,11 +130,11 @@ struct object_entry_pool struct mark_set { - int shift; union { struct object_entry *marked[1024]; struct mark_set *sets[1024]; } data; + unsigned int shift; }; struct last_object @@ -157,7 +157,7 @@ struct mem_pool struct atom_str { struct atom_str *next_atom; - int str_len; + unsigned int str_len; char str_dat[FLEX_ARRAY]; /* more */ }; @@ -192,8 +192,8 @@ struct branch struct branch *table_next_branch; struct branch *active_next_branch; const char *name; - unsigned long last_commit; struct tree_entry branch_tree; + unsigned long last_commit; unsigned int pack_id; unsigned char sha1[20]; }; From fd99224eec67d89f970b207e7db031b7c58e812e Mon Sep 17 00:00:00 2001 From: "Shawn O. Pearce" Date: Wed, 17 Jan 2007 01:47:25 -0500 Subject: [PATCH 60/81] Declare no-arg functions as (void) in fast-import. Apparently the git convention is to declare any function which takes no arguments as taking void. I did not do this during the early fast-import development, but should have. Signed-off-by: Shawn O. 
Pearce --- fast-import.c | 36 ++++++++++++++++++------------------ 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/fast-import.c b/fast-import.c index 2c500d6be3..84f855fb8e 100644 --- a/fast-import.c +++ b/fast-import.c @@ -561,7 +561,7 @@ static struct tree_content* grow_tree_content( return r; } -static struct tree_entry* new_tree_entry() +static struct tree_entry* new_tree_entry(void) { struct tree_entry *e; @@ -589,7 +589,7 @@ static void release_tree_entry(struct tree_entry *e) avail_tree_entry = e; } -static void start_packfile() +static void start_packfile(void) { static char tmpfile[PATH_MAX]; struct packed_git *p; @@ -618,7 +618,7 @@ static void start_packfile() all_packs[pack_id] = p; } -static void fixup_header_footer() +static void fixup_header_footer(void) { int pack_fd = pack_data->pack_fd; SHA_CTX c; @@ -659,7 +659,7 @@ static int oecmp (const void *a_, const void *b_) return hashcmp(a->sha1, b->sha1); } -static char* create_index() +static char* create_index(void) { static char tmpfile[PATH_MAX]; SHA_CTX ctx; @@ -744,7 +744,7 @@ static char* keep_pack(char *curr_index_name) return name; } -static void unkeep_all_packs() +static void unkeep_all_packs(void) { static char name[PATH_MAX]; int k; @@ -757,7 +757,7 @@ static void unkeep_all_packs() } } -static void end_packfile() +static void end_packfile(void) { struct packed_git *old_p = pack_data, *new_p; @@ -806,7 +806,7 @@ static void end_packfile() last_blob.depth = 0; } -static void checkpoint() +static void checkpoint(void) { end_packfile(); start_packfile(); @@ -1253,7 +1253,7 @@ del_entry: return 1; } -static void dump_branches() +static void dump_branches(void) { static const char *msg = "fast-import"; unsigned int i; @@ -1269,7 +1269,7 @@ static void dump_branches() } } -static void dump_tags() +static void dump_tags(void) { static const char *msg = "fast-import"; struct tag *t; @@ -1304,7 +1304,7 @@ static void dump_marks_helper(FILE *f, } } -static void dump_marks() +static 
void dump_marks(void) { if (mark_file) { @@ -1314,12 +1314,12 @@ static void dump_marks() } } -static void read_next_command() +static void read_next_command(void) { read_line(&command_buf, stdin, '\n'); } -static void cmd_mark() +static void cmd_mark(void) { if (!strncmp("mark :", command_buf.buf, 6)) { next_mark = strtoumax(command_buf.buf + 6, NULL, 10); @@ -1355,7 +1355,7 @@ static void* cmd_data (size_t *size) return buffer; } -static void cmd_new_blob() +static void cmd_new_blob(void) { size_t l; void *d; @@ -1368,7 +1368,7 @@ static void cmd_new_blob() free(d); } -static void unload_one_branch() +static void unload_one_branch(void) { while (cur_active_branches && cur_active_branches >= max_active_branches) { @@ -1601,7 +1601,7 @@ static struct hash_list* cmd_merge(unsigned int *count) return list; } -static void cmd_new_commit() +static void cmd_new_commit(void) { struct branch *b; void *msg; @@ -1715,7 +1715,7 @@ static void cmd_new_commit() } } -static void cmd_new_tag() +static void cmd_new_tag(void) { char *str_uq; const char *endp; @@ -1824,7 +1824,7 @@ static void cmd_new_tag() } } -static void cmd_reset_branch() +static void cmd_reset_branch(void) { struct branch *b; char *str_uq; @@ -1855,7 +1855,7 @@ static void cmd_reset_branch() cmd_from(b); } -static void cmd_checkpoint() +static void cmd_checkpoint(void) { if (object_count) checkpoint(); From 69e74e7412603dd536695c3d6a397673e8ae2bd2 Mon Sep 17 00:00:00 2001 From: "Shawn O. Pearce" Date: Wed, 17 Jan 2007 02:42:43 -0500 Subject: [PATCH 61/81] Correct packfile edge output in fast-import. Branches are only contained by a packfile if the branch actually had its most recent commit in that packfile. So new branches are set to MAX_PACK_ID to ensure they don't cause their commit to list as part of the first packfile when it closes out if the commit was actually in existance before fast-import started. 
Also corrected the type of last_commit to be umaxint_t to prevent overflow and wraparound on very large imports. Though that is highly unlikely to occur as we're talking 4 billion commits, which no real project has right now. Signed-off-by: Shawn O. Pearce --- fast-import.c | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) diff --git a/fast-import.c b/fast-import.c index 84f855fb8e..a3073c5f03 100644 --- a/fast-import.c +++ b/fast-import.c @@ -111,12 +111,15 @@ Format of STDIN stream: #include "strbuf.h" #include "quote.h" +#define PACK_ID_BITS 16 +#define MAX_PACK_ID ((1<table_next_branch = branch_table[hc]; b->branch_tree.versions[0].mode = S_IFDIR; b->branch_tree.versions[1].mode = S_IFDIR; + b->pack_id = MAX_PACK_ID; branch_table[hc] = b; branch_count++; return b; @@ -1696,11 +1700,11 @@ static void cmd_new_commit(void) free(committer); free(msg); - store_object(OBJ_COMMIT, + if (!store_object(OBJ_COMMIT, new_data.buffer, sp - (char*)new_data.buffer, - NULL, b->sha1, next_mark); + NULL, b->sha1, next_mark)) + b->pack_id = pack_id; b->last_commit = object_count_by_type[OBJ_COMMIT]; - b->pack_id = pack_id; if (branch_log) { int need_dq = quote_c_style(b->name, NULL, NULL, 0); @@ -1807,9 +1811,12 @@ static void cmd_new_tag(void) free(tagger); free(msg); - store_object(OBJ_TAG, new_data.buffer, sp - (char*)new_data.buffer, - NULL, t->sha1, 0); - t->pack_id = pack_id; + if (store_object(OBJ_TAG, new_data.buffer, + sp - (char*)new_data.buffer, + NULL, t->sha1, 0)) + t->pack_id = MAX_PACK_ID; + else + t->pack_id = pack_id; if (branch_log) { int need_dq = quote_c_style(t->name, NULL, NULL, 0); From 566f44252b00003d1f4e7baaaf709d74bf73770f Mon Sep 17 00:00:00 2001 From: "Shawn O. Pearce" Date: Thu, 18 Jan 2007 11:26:06 -0500 Subject: [PATCH 62/81] Always use struct pack_header for pack header in fast-import. 
Previously we were using 'unsigned int' to update the hdr_entries field of the pack header after the file had been completed and was being hashed. This may not be 32 bits on all platforms. Instead we want to always uint32_t. I'm actually cheating here by just using the pack_header like the rest of Git and letting the struct definition declare the correct type. Right now that field is still 'unsigned int' (wrong) but a pending change submitted by Simon 'corecode' Schubert changes it to uint32_t. After that change is merged in fast-import will do the right thing all of the time. Signed-off-by: Shawn O. Pearce --- fast-import.c | 26 ++++++++++++++------------ 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/fast-import.c b/fast-import.c index a3073c5f03..fb7d912eff 100644 --- a/fast-import.c +++ b/fast-import.c @@ -624,29 +624,31 @@ static void start_packfile(void) static void fixup_header_footer(void) { + static const int buf_sz = 128 * 1024; int pack_fd = pack_data->pack_fd; SHA_CTX c; - char hdr[8]; - unsigned long cnt; + struct pack_header hdr; char *buf; if (lseek(pack_fd, 0, SEEK_SET) != 0) die("Failed seeking to start: %s", strerror(errno)); + if (read_in_full(pack_fd, &hdr, sizeof(hdr)) != sizeof(hdr)) + die("Unable to reread header of %s", pack_data->pack_name); + if (lseek(pack_fd, 0, SEEK_SET) != 0) + die("Failed seeking to start: %s", strerror(errno)); + hdr.hdr_entries = htonl(object_count); + write_or_die(pack_fd, &hdr, sizeof(hdr)); SHA1_Init(&c); - if (read_in_full(pack_fd, hdr, 8) != 8) - die("Unable to reread header of %s", pack_data->pack_name); - SHA1_Update(&c, hdr, 8); + SHA1_Update(&c, &hdr, sizeof(hdr)); - cnt = htonl(object_count); - SHA1_Update(&c, &cnt, 4); - write_or_die(pack_fd, &cnt, 4); - - buf = xmalloc(128 * 1024); + buf = xmalloc(buf_sz); for (;;) { - size_t n = xread(pack_fd, buf, 128 * 1024); - if (n <= 0) + size_t n = xread(pack_fd, buf, buf_sz); + if (!n) break; + if (n < 0) + die("Failed to checksum %s", 
pack_data->pack_name); SHA1_Update(&c, buf, n); } free(buf); From ebea9dd4f1b62cb3c8302f10aaca3af0231e9818 Mon Sep 17 00:00:00 2001 From: "Shawn O. Pearce" Date: Thu, 18 Jan 2007 11:30:17 -0500 Subject: [PATCH 63/81] Use fixed-size integers when writing out the index in fast-import. Currently the pack .idx file format uses 32-bit unsigned integers for the fan-out table and the object offsets. We had previously defined these as 'unsigned int', but not every system will define that type to be a 32 bit value. To ensure maximum portability we should always use 'uint32_t'. Signed-off-by: Shawn O. Pearce --- fast-import.c | 4 ++-- git-compat-util.h | 1 + 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/fast-import.c b/fast-import.c index fb7d912eff..7f519b4de3 100644 --- a/fast-import.c +++ b/fast-import.c @@ -672,7 +672,7 @@ static char* create_index(void) struct sha1file *f; struct object_entry **idx, **c, **last, *e; struct object_entry_pool *o; - unsigned int array[256]; + uint32_t array[256]; int i, idx_fd; /* Build the sorted table of object IDs. */ @@ -709,7 +709,7 @@ static char* create_index(void) sha1write(f, array, 256 * sizeof(int)); SHA1_Init(&ctx); for (c = idx; c != last; c++) { - unsigned int offset = htonl((*c)->offset); + uint32_t offset = htonl((*c)->offset); sha1write(f, &offset, 4); sha1write(f, (*c)->sha1, sizeof((*c)->sha1)); SHA1_Update(&ctx, (*c)->sha1, 20); diff --git a/git-compat-util.h b/git-compat-util.h index 614583e56a..ac06963e8d 100644 --- a/git-compat-util.h +++ b/git-compat-util.h @@ -46,6 +46,7 @@ #include #include #include +#include #include #ifndef NO_ICONV From e5808826c4abe183b4db9bae8f13445624696f66 Mon Sep 17 00:00:00 2001 From: "Shawn O. Pearce" Date: Thu, 18 Jan 2007 12:00:49 -0500 Subject: [PATCH 64/81] Remove unnecessary options from fast-import. The --objects command line option is rather unnecessary. 
Internally we allocate objects in 5000 unit blocks, ensuring that any sort of malloc overhead is ammortized over the individual objects to almost nothing. Since most frontends don't know how many objects they will need for a given import run (and its hard for them to predict without just doing the run) we probably won't see anyone using --objects. Further since there's really no major benefit to using the option, most frontends won't even bother supplying it even if they could estimate the number of objects. So I'm removing it. The --max-objects-per-pack option was probably a mistake to even have added in the first place. The packfile format is limited to 4 GiB today; given that objects need at least 3 bytes of data (and probably need even more) there's no way we are going to exceed the limit of 1<<32-1 objects before we reach the file size limit. So I'm removing it (to slightly reduce the complexity of the code) before anyone gets any wise ideas and tries to use it. Signed-off-by: Shawn O. Pearce --- fast-import.c | 16 ++++------------ 1 file changed, 4 insertions(+), 12 deletions(-) diff --git a/fast-import.c b/fast-import.c index 7f519b4de3..9a642f2e02 100644 --- a/fast-import.c +++ b/fast-import.c @@ -224,7 +224,6 @@ struct hash_list /* Configured limits on output */ static unsigned long max_depth = 10; static unsigned long max_packsize = (1LL << 32) - 1; -static uintmax_t max_objects = -1; /* Stats and misc. counters */ static uintmax_t alloc_count; @@ -900,9 +899,7 @@ static int store_object( deflateEnd(&s); /* Determine if we should auto-checkpoint. */ - if ((object_count + 1) > max_objects - || (object_count + 1) < object_count - || (pack_size + 60 + s.total_out) > max_packsize + if ((pack_size + 60 + s.total_out) > max_packsize || (pack_size + 60 + s.total_out) < pack_size) { /* This new object needs to *not* have the current pack_id. 
*/ @@ -1872,12 +1869,11 @@ static void cmd_checkpoint(void) } static const char fast_import_usage[] = -"git-fast-import [--objects=n] [--depth=n] [--active-branches=n] [--export-marks=marks.file] [--branch-log=log]"; +"git-fast-import [--depth=n] [--active-branches=n] [--export-marks=marks.file] [--branch-log=log]"; int main(int argc, const char **argv) { int i; - uintmax_t est_obj_cnt = object_entry_alloc; uintmax_t total_count, duplicate_count; setup_ident(); @@ -1888,10 +1884,6 @@ int main(int argc, const char **argv) if (*a != '-' || !strcmp(a, "--")) break; - else if (!strncmp(a, "--objects=", 10)) - est_obj_cnt = strtoumax(a + 10, NULL, 0); - else if (!strncmp(a, "--max-objects-per-pack=", 23)) - max_objects = strtoumax(a + 23, NULL, 0); else if (!strncmp(a, "--max-pack-size=", 16)) max_packsize = strtoumax(a + 16, NULL, 0) * 1024 * 1024; else if (!strncmp(a, "--depth=", 8)) @@ -1911,7 +1903,7 @@ int main(int argc, const char **argv) if (i != argc) usage(fast_import_usage); - alloc_objects(est_obj_cnt); + alloc_objects(object_entry_alloc); strbuf_init(&command_buf); atom_table = xcalloc(atom_table_sz, sizeof(struct atom_str*)); @@ -1955,7 +1947,7 @@ int main(int argc, const char **argv) fprintf(stderr, "%s statistics:\n", argv[0]); fprintf(stderr, "---------------------------------------------------------------------\n"); - fprintf(stderr, "Alloc'd objects: %10ju (%10ju overflow )\n", alloc_count, alloc_count - est_obj_cnt); + fprintf(stderr, "Alloc'd objects: %10ju\n", alloc_count); fprintf(stderr, "Total objects: %10ju (%10ju duplicates )\n", total_count, duplicate_count); fprintf(stderr, " blobs : %10ju (%10ju duplicates %10ju deltas)\n", object_count_by_type[OBJ_BLOB], duplicate_count_by_type[OBJ_BLOB], delta_count_by_type[OBJ_BLOB]); fprintf(stderr, " trees : %10ju (%10ju duplicates %10ju deltas)\n", object_count_by_type[OBJ_TREE], duplicate_count_by_type[OBJ_TREE], delta_count_by_type[OBJ_TREE]); From 3b4dce02752d37c3cef9308eefb01ed758efe323 Mon Sep 17 
00:00:00 2001 From: "Shawn O. Pearce" Date: Thu, 18 Jan 2007 13:14:27 -0500 Subject: [PATCH 65/81] Support delimited data regions in fast-import. During testing its nice to not have to feed the length of a data chunk to the 'data' command of fast-import. Instead we would prefer to be able to establish a data chunk much like shell's << operator and use a line delimiter to denote the end of the input. So now if a data command is started as 'data < --- fast-import.c | 64 +++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 49 insertions(+), 15 deletions(-) diff --git a/fast-import.c b/fast-import.c index 9a642f2e02..90adc68042 100644 --- a/fast-import.c +++ b/fast-import.c @@ -50,14 +50,21 @@ Format of STDIN stream: # a new mark directive with the old idnum. # mark ::= 'mark' sp idnum lf; + data ::= (delimited_data | exact_data) + lf; + + # note: delim may be any string but must not contain lf. + # data_line may contain any data but must not be exactly + # delim. + delimited_data ::= 'data' sp '<<' delim lf + (data_line lf)* + delim lf; # note: declen indicates the length of binary_data in bytes. - # declen does not include the lf preceeding or trailing the - # binary data. + # declen does not include the lf preceeding the binary data. 
# - data ::= 'data' sp declen lf - binary_data - lf; + exact_data ::= 'data' sp declen lf + binary_data; # note: quoted strings are C-style quoting supporting \c for # common escapes of 'c' (e..g \n, \t, \\, \") or \nnn where nnn @@ -1334,21 +1341,48 @@ static void cmd_mark(void) static void* cmd_data (size_t *size) { - size_t n = 0; - void *buffer; size_t length; + char *buffer; if (strncmp("data ", command_buf.buf, 5)) die("Expected 'data n' command, found: %s", command_buf.buf); - length = strtoul(command_buf.buf + 5, NULL, 10); - buffer = xmalloc(length); - - while (n < length) { - size_t s = fread((char*)buffer + n, 1, length - n, stdin); - if (!s && feof(stdin)) - die("EOF in data (%lu bytes remaining)", length - n); - n += s; + if (!strncmp("<<", command_buf.buf + 5, 2)) { + char *term = xstrdup(command_buf.buf + 5 + 2); + size_t sz = 8192, term_len = command_buf.len - 5 - 2; + length = 0; + buffer = xmalloc(sz); + for (;;) { + read_next_command(); + if (command_buf.eof) + die("EOF in data (terminator '%s' not found)", term); + if (term_len == command_buf.len + && !strcmp(term, command_buf.buf)) + break; + if (sz < (length + command_buf.len)) { + sz = sz * 3 / 2 + 16; + if (sz < (length + command_buf.len)) + sz = length + command_buf.len; + buffer = xrealloc(buffer, sz); + } + memcpy(buffer + length, + command_buf.buf, + command_buf.len - 1); + length += command_buf.len - 1; + buffer[length++] = '\n'; + } + free(term); + } + else { + size_t n = 0; + length = strtoul(command_buf.buf + 5, NULL, 10); + buffer = xmalloc(length); + while (n < length) { + size_t s = fread(buffer + n, 1, length - n, stdin); + if (!s && feof(stdin)) + die("EOF in data (%lu bytes remaining)", length - n); + n += s; + } } if (fgetc(stdin) != '\n') From 50aee995121a103fe2698574e7f1d56660a5b89b Mon Sep 17 00:00:00 2001 From: "Shawn O. Pearce" Date: Thu, 18 Jan 2007 13:26:24 -0500 Subject: [PATCH 66/81] Create test case for fast-import. 
Now that its easier to craft test cases (thanks to 'data <<') we should start to verify fast-import works as expected. Signed-off-by: Shawn O. Pearce --- t/t9300-fast-import.sh | 184 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 184 insertions(+) create mode 100755 t/t9300-fast-import.sh diff --git a/t/t9300-fast-import.sh b/t/t9300-fast-import.sh new file mode 100755 index 0000000000..1463476b21 --- /dev/null +++ b/t/t9300-fast-import.sh @@ -0,0 +1,184 @@ +#!/bin/sh +# +# Copyright (c) 2007 Shawn Pearce +# + +test_description='test git-fast-import utility' +. ./test-lib.sh +. ../diff-lib.sh ;# test-lib chdir's into trash + +### +### series A +### + +test_tick +cat >input < $GIT_COMMITTER_DATE +data <expect < $GIT_COMMITTER_DATE +committer $GIT_COMMITTER_NAME <$GIT_COMMITTER_EMAIL> $GIT_COMMITTER_DATE + +initial +EOF +test_expect_success \ + 'A: verify commit' \ + 'git-cat-file commit master | sed 1d >actual && + diff -u expect actual' + +cat >expect <actual && + diff -u expect actual' + +cat >expect <actual && diff -u expect actual' + +cat >expect <actual && diff -u expect actual' + +printf abcd >expect +test_expect_success \ + 'A: verify file4' \ + 'git-cat-file blob master:file4 >actual && diff -u expect actual' + +cat >expect <input < $GIT_COMMITTER_DATE +data <input < $GIT_COMMITTER_DATE +data <expect < $GIT_COMMITTER_DATE +committer $GIT_COMMITTER_NAME <$GIT_COMMITTER_EMAIL> $GIT_COMMITTER_DATE + +second +EOF +test_expect_success \ + 'C: verify commit' \ + 'git-cat-file commit branch | sed 1d >actual && + diff -u expect actual' + +cat >expect <actual +test_expect_success \ + 'C: validate rename result' \ + 'compare_diff_raw expect actual' + +test_done From 8232dc427fb4b92b38e74e9e93b52231a67e354f Mon Sep 17 00:00:00 2001 From: "Shawn O. Pearce" Date: Thu, 18 Jan 2007 14:49:05 -0500 Subject: [PATCH 67/81] Reduce value duplication in t9300-fast-import. 
It is error prone to list the value of each file twice, instead we should list the value only once early in the script and reuse the shell variable when we need to access it. Signed-off-by: Shawn O. Pearce --- t/t9300-fast-import.sh | 34 +++++++++++++++++----------------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/t/t9300-fast-import.sh b/t/t9300-fast-import.sh index 1463476b21..40b8c073bd 100755 --- a/t/t9300-fast-import.sh +++ b/t/t9300-fast-import.sh @@ -7,6 +7,16 @@ test_description='test git-fast-import utility' . ./test-lib.sh . ../diff-lib.sh ;# test-lib chdir's into trash +file2_data='file2 +second line of EOF' + +file3_data='EOF +in 3rd file + END' + +file4_data=abcd +file4_len=4 + ### ### series A ### @@ -16,22 +26,19 @@ cat >input < $GIT_COMMITTER_DATE @@ -73,24 +80,17 @@ test_expect_success \ 'git-cat-file -p master^{tree} | sed "s/ [0-9a-f]* / /" >actual && diff -u expect actual' -cat >expect <expect test_expect_success \ 'A: verify file2' \ 'git-cat-file blob master:file2 >actual && diff -u expect actual' -cat >expect <expect test_expect_success \ 'A: verify file3' \ 'git-cat-file blob master:file3 >actual && diff -u expect actual' -printf abcd >expect +printf "$file4_data" >expect test_expect_success \ 'A: verify file4' \ 'git-cat-file blob master:file4 >actual && diff -u expect actual' From b715cfbba4083d25ec0d0f94e440ad734607ddb0 Mon Sep 17 00:00:00 2001 From: "Shawn O. Pearce" Date: Thu, 18 Jan 2007 15:17:58 -0500 Subject: [PATCH 68/81] Accept 'inline' file data in fast-import commit structure. Its very annoying to need to specify the file content ahead of a commit and use marks to connect the individual blobs to the commit's file modification entry, especially if the frontend can't/won't generate the blob SHA1s itself. Instead it would much easier to use if we can accept the blob data at the same time as we receive each file_change line. 
Now fast-import accepts 'inline' instead of a mark idnum or blob SHA1 within the 'M' type file_change command. If an inline is detected the very next line must be a 'data n' command, supplying the file data. Signed-off-by: Shawn O. Pearce --- fast-import.c | 29 ++++++++++++++++----- t/t9300-fast-import.sh | 59 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 82 insertions(+), 6 deletions(-) diff --git a/fast-import.c b/fast-import.c index 90adc68042..487a91a4ee 100644 --- a/fast-import.c +++ b/fast-import.c @@ -25,10 +25,11 @@ Format of STDIN stream: lf; commit_msg ::= data; - file_change ::= 'M' sp mode sp (hexsha1 | idnum) sp path_str lf - | 'D' sp path_str lf - ; - mode ::= '644' | '755'; + file_change ::= file_del | file_obm | file_inm; + file_del ::= 'D' sp path_str lf; + file_obm ::= 'M' sp mode sp (hexsha1 | idnum) sp path_str lf; + file_inm ::= 'M' sp mode sp 'inline' sp path_str lf + data; new_tag ::= 'tag' sp tag_str lf 'from' sp (ref_str | hexsha1 | sha1exp_str | idnum) lf @@ -77,6 +78,10 @@ Format of STDIN stream: sha1exp_str ::= sha1exp | '"' quoted(sha1exp) '"' ; tag_str ::= tag | '"' quoted(tag) '"' ; path_str ::= path | '"' quoted(path) '"' ; + mode ::= '100644' | '644' + | '100755' | '755' + | '140000' + ; declen ::= # unsigned 32 bit value, ascii base10 notation; bigint ::= # unsigned integer value, ascii base10 notation; @@ -1452,7 +1457,7 @@ static void file_change_m(struct branch *b) const char *endp; struct object_entry *oe; unsigned char sha1[20]; - unsigned int mode; + unsigned int mode, inline_data = 0; char type[20]; p = get_mode(p, &mode); @@ -1475,6 +1480,9 @@ static void file_change_m(struct branch *b) oe = find_mark(strtoumax(p + 1, &x, 10)); hashcpy(sha1, oe->sha1); p = x; + } else if (!strncmp("inline", p, 6)) { + inline_data = 1; + p += 6; } else { if (get_sha1_hex(p, sha1)) die("Invalid SHA1: %s", command_buf.buf); @@ -1491,7 +1499,16 @@ static void file_change_m(struct branch *b) p = p_uq; } - if (oe) { + if 
(inline_data) { + size_t l; + void *d; + if (!p_uq) + p = p_uq = xstrdup(p); + read_next_command(); + d = cmd_data(&l); + if (store_object(OBJ_BLOB, d, l, &last_blob, sha1, 0)) + free(d); + } else if (oe) { if (oe->type != OBJ_BLOB) die("Not a blob (actually a %s): %s", command_buf.buf, type_names[oe->type]); diff --git a/t/t9300-fast-import.sh b/t/t9300-fast-import.sh index 40b8c073bd..a5cc846b34 100755 --- a/t/t9300-fast-import.sh +++ b/t/t9300-fast-import.sh @@ -17,6 +17,12 @@ in 3rd file file4_data=abcd file4_len=4 +file5_data='an inline file. + we should see it later.' + +file6_data='#!/bin/sh +echo "$@"' + ### ### series A ### @@ -181,4 +187,57 @@ test_expect_success \ 'C: validate rename result' \ 'compare_diff_raw expect actual' +### +### series D +### + +test_tick +cat >input < $GIT_COMMITTER_DATE +data <expect <actual +test_expect_success \ + 'D: validate new files added' \ + 'compare_diff_raw expect actual' + +echo "$file5_data" >expect +test_expect_success \ + 'D: verify file5' \ + 'git-cat-file blob branch:newdir/interesting >actual && + diff -u expect actual' + +echo "$file6_data" >expect +test_expect_success \ + 'D: verify file6' \ + 'git-cat-file blob branch:newdir/exec.sh >actual && + diff -u expect actual' + test_done From 8c1f22da9f8124dfabb5da8476845250b5c35ae8 Mon Sep 17 00:00:00 2001 From: "Shawn O. Pearce" Date: Mon, 5 Feb 2007 16:05:11 -0500 Subject: [PATCH 69/81] Include checkpoint command in the BNF. This command isn't encouraged (as its slow) but it does exist and is accepted, so it still should be covered in the BNF. Signed-off-by: Shawn O. Pearce --- fast-import.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fast-import.c b/fast-import.c index f62a5a9f6c..1559f9c0ff 100644 --- a/fast-import.c +++ b/fast-import.c @@ -7,6 +7,7 @@ Format of STDIN stream: | new_commit | new_tag | reset_branch + | checkpoint ; new_blob ::= 'blob' lf From 10831c551323121bdab06c3eaf2f52c6658fd6b8 Mon Sep 17 00:00:00 2001 From: "Shawn O. 
Pearce" Date: Mon, 5 Feb 2007 16:34:56 -0500 Subject: [PATCH 70/81] Reduce memory usage of fast-import. Some structs are allocated rather frequently, but were using integer types which were far larger than required to actually store their full value range. As packfiles are limited to 4 GiB we don't need more than 32 bits to store the offset of an object within that packfile, an `unsigned long` on a 64 bit system is likely a 64 bit unsigned value. Saving 4 bytes per object on a 64 bit system can add up fast on any sizable import. As atom strings are strictly single components in a path name these are probably limited to just 255 bytes by the underlying OS. Going to that short of a string is probably too restrictive, but certainly `unsigned int` is far too large for their lengths. `unsigned short` is a reasonable limit. Modes within a tree really only need two bytes to store their whole value; using `unsigned int` here is vast overkill. Saving 4 bytes per file entry in an active branch can add up quickly on a project with a large number of files. Signed-off-by: Shawn O. 
Pearce --- fast-import.c | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/fast-import.c b/fast-import.c index 1559f9c0ff..9658c28413 100644 --- a/fast-import.c +++ b/fast-import.c @@ -130,7 +130,7 @@ Format of STDIN stream: struct object_entry { struct object_entry *next; - unsigned long offset; + uint32_t offset; unsigned type : TYPE_BITS; unsigned pack_id : PACK_ID_BITS; unsigned char sha1[20]; @@ -157,7 +157,7 @@ struct last_object { void *data; unsigned long len; - unsigned long offset; + uint32_t offset; unsigned int depth; unsigned no_free:1; }; @@ -173,7 +173,7 @@ struct mem_pool struct atom_str { struct atom_str *next_atom; - unsigned int str_len; + unsigned short str_len; char str_dat[FLEX_ARRAY]; /* more */ }; @@ -184,7 +184,7 @@ struct tree_entry struct atom_str* name; struct tree_entry_ms { - unsigned int mode; + uint16_t mode; unsigned char sha1[20]; } versions[2]; }; @@ -464,7 +464,7 @@ static struct object_entry* find_mark(uintmax_t idnum) return oe; } -static struct atom_str* to_atom(const char *s, size_t len) +static struct atom_str* to_atom(const char *s, unsigned short len) { unsigned int hc = hc_str(s, len) % atom_table_sz; struct atom_str *c; @@ -993,10 +993,10 @@ static void *gfi_unpack_entry( return unpack_entry(p, oe->offset, type, sizep); } -static const char *get_mode(const char *str, unsigned int *modep) +static const char *get_mode(const char *str, uint16_t *modep) { unsigned char c; - unsigned int mode = 0; + uint16_t mode = 0; while ((c = *str++) != ' ') { if (c < '0' || c > '7') @@ -1046,7 +1046,7 @@ static void load_tree(struct tree_entry *root) if (!c) die("Corrupt mode in %s", sha1_to_hex(sha1)); e->versions[0].mode = e->versions[1].mode; - e->name = to_atom(c, strlen(c)); + e->name = to_atom(c, (unsigned short)strlen(c)); c += e->name->str_len + 1; hashcpy(e->versions[0].sha1, (unsigned char*)c); hashcpy(e->versions[1].sha1, (unsigned char*)c); @@ -1098,7 +1098,7 @@ static void 
mktree(struct tree_content *t, struct tree_entry *e = t->entries[i]; if (!e->versions[v].mode) continue; - c += sprintf(c, "%o", e->versions[v].mode); + c += sprintf(c, "%o", (unsigned int)e->versions[v].mode); *c++ = ' '; strcpy(c, e->name->str_dat); c += e->name->str_len + 1; @@ -1161,7 +1161,7 @@ static int tree_content_set( struct tree_entry *root, const char *p, const unsigned char *sha1, - const unsigned int mode) + const uint16_t mode) { struct tree_content *t = root->tree; const char *slash1; @@ -1207,7 +1207,7 @@ static int tree_content_set( if (t->entry_count == t->entry_capacity) root->tree = t = grow_tree_content(t, 8); e = new_tree_entry(); - e->name = to_atom(p, n); + e->name = to_atom(p, (unsigned short)n); e->versions[0].mode = 0; hashclr(e->versions[0].sha1); t->entries[t->entry_count++] = e; @@ -1458,7 +1458,7 @@ static void file_change_m(struct branch *b) const char *endp; struct object_entry *oe; unsigned char sha1[20]; - unsigned int mode, inline_data = 0; + uint16_t mode, inline_data = 0; char type[20]; p = get_mode(p, &mode); From 6c3aac1c69ea0bcb2896bec96a01fdf8aa6176fa Mon Sep 17 00:00:00 2001 From: "Shawn O. Pearce" Date: Mon, 5 Feb 2007 20:30:37 -0500 Subject: [PATCH 71/81] Don't support shell-quoted refnames in fast-import. The current implementation of shell-style quoted refnames and SHA-1 expressions within fast-import contains a bad memory leak. We leak the unquoted strings used by the `from` and `merge` commands, maybe others. Its also just muddling up the docs. Since Git refnames cannot contain LF, and that is our delimiter for the end of the refname, and we accept any other character as-is, there is no reason for these strings to support quoting, except to be nice to frontends. But frontends shouldn't be expecting to use funny refs in Git, and its just as simple to never quote them as it is to always pass them through the same quoting filter as pathnames. So frontends should never quote refs, or ref expressions. 
Signed-off-by: Shawn O. Pearce --- fast-import.c | 67 ++++----------------------------------------------- 1 file changed, 5 insertions(+), 62 deletions(-) diff --git a/fast-import.c b/fast-import.c index 9658c28413..e6342386fc 100644 --- a/fast-import.c +++ b/fast-import.c @@ -75,9 +75,9 @@ Format of STDIN stream: # stream formatting is: \, " and LF. Otherwise these values # are UTF8. # - ref_str ::= ref | '"' quoted(ref) '"' ; - sha1exp_str ::= sha1exp | '"' quoted(sha1exp) '"' ; - tag_str ::= tag | '"' quoted(tag) '"' ; + ref_str ::= ref; + sha1exp_str ::= sha1exp; + tag_str ::= tag; path_str ::= path | '"' quoted(path) '"' ; mode ::= '100644' | '644' | '100755' | '755' @@ -1546,8 +1546,7 @@ static void file_change_d(struct branch *b) static void cmd_from(struct branch *b) { - const char *from, *endp; - char *str_uq; + const char *from; struct branch *s; if (strncmp("from ", command_buf.buf, 5)) @@ -1557,13 +1556,6 @@ static void cmd_from(struct branch *b) die("Can't reinitailize branch %s", b->name); from = strchr(command_buf.buf, ' ') + 1; - str_uq = unquote_c_style(from, &endp); - if (str_uq) { - if (*endp) - die("Garbage after string in: %s", command_buf.buf); - from = str_uq; - } - s = lookup_branch(from); if (b == s) die("Can't create a branch from itself: %s", b->name); @@ -1617,20 +1609,12 @@ static void cmd_from(struct branch *b) static struct hash_list* cmd_merge(unsigned int *count) { struct hash_list *list = NULL, *n, *e; - const char *from, *endp; - char *str_uq; + const char *from; struct branch *s; *count = 0; while (!strncmp("merge ", command_buf.buf, 6)) { from = strchr(command_buf.buf, ' ') + 1; - str_uq = unquote_c_style(from, &endp); - if (str_uq) { - if (*endp) - die("Garbage after string in: %s", command_buf.buf); - from = str_uq; - } - n = xmalloc(sizeof(*n)); s = lookup_branch(from); if (s) @@ -1661,8 +1645,6 @@ static void cmd_new_commit(void) struct branch *b; void *msg; size_t msglen; - char *str_uq; - const char *endp; char *sp; char 
*author = NULL; char *committer = NULL; @@ -1671,17 +1653,9 @@ static void cmd_new_commit(void) /* Obtain the branch name from the rest of our command */ sp = strchr(command_buf.buf, ' ') + 1; - str_uq = unquote_c_style(sp, &endp); - if (str_uq) { - if (*endp) - die("Garbage after ref in: %s", command_buf.buf); - sp = str_uq; - } b = lookup_branch(sp); if (!b) b = new_branch(sp); - if (str_uq) - free(str_uq); read_next_command(); cmd_mark(); @@ -1772,8 +1746,6 @@ static void cmd_new_commit(void) static void cmd_new_tag(void) { - char *str_uq; - const char *endp; char *sp; const char *from; char *tagger; @@ -1786,12 +1758,6 @@ static void cmd_new_tag(void) /* Obtain the new tag name from the rest of our command */ sp = strchr(command_buf.buf, ' ') + 1; - str_uq = unquote_c_style(sp, &endp); - if (str_uq) { - if (*endp) - die("Garbage after tag name in: %s", command_buf.buf); - sp = str_uq; - } t = pool_alloc(sizeof(struct tag)); t->next_tag = NULL; t->name = pool_strdup(sp); @@ -1800,22 +1766,12 @@ static void cmd_new_tag(void) else first_tag = t; last_tag = t; - if (str_uq) - free(str_uq); read_next_command(); /* from ... */ if (strncmp("from ", command_buf.buf, 5)) die("Expected from command, got %s", command_buf.buf); - from = strchr(command_buf.buf, ' ') + 1; - str_uq = unquote_c_style(from, &endp); - if (str_uq) { - if (*endp) - die("Garbage after string in: %s", command_buf.buf); - from = str_uq; - } - s = lookup_branch(from); if (s) { hashcpy(sha1, s->sha1); @@ -1836,9 +1792,6 @@ static void cmd_new_tag(void) free(buf); } else die("Invalid ref name or SHA1 expression: %s", from); - - if (str_uq) - free(str_uq); read_next_command(); /* tagger ... 
*/ @@ -1885,18 +1838,10 @@ static void cmd_new_tag(void) static void cmd_reset_branch(void) { struct branch *b; - char *str_uq; - const char *endp; char *sp; /* Obtain the branch name from the rest of our command */ sp = strchr(command_buf.buf, ' ') + 1; - str_uq = unquote_c_style(sp, &endp); - if (str_uq) { - if (*endp) - die("Garbage after ref in: %s", command_buf.buf); - sp = str_uq; - } b = lookup_branch(sp); if (b) { b->last_commit = 0; @@ -1907,8 +1852,6 @@ static void cmd_reset_branch(void) } else b = new_branch(sp); - if (str_uq) - free(str_uq); read_next_command(); cmd_from(b); } From 6e411d2044072072692f2d9cf9d633421ef6017a Mon Sep 17 00:00:00 2001 From: "Shawn O. Pearce" Date: Mon, 5 Feb 2007 21:09:25 -0500 Subject: [PATCH 72/81] Initial draft of fast-import documentation. This is a first pass at the manpage for git-fast-import. I have tried to cover the input format in extreme detail, creating a reference which is more detailed than the BNF grammar appearing in the header of fast-import.c. I have also covered some details about gfi's performance and memory utilization, as well as the average learning curve required to create a gfi frontend application (as it is far lower than it might appear on first glance). The documentation still lacks real example input streams, which may turn out to be difficult to format in asciidoc due to the blank lines which carry meaning within the format. Signed-off-by: Shawn O. Pearce --- Documentation/git-fast-import.txt | 655 ++++++++++++++++++++++++++++++ 1 file changed, 655 insertions(+) create mode 100644 Documentation/git-fast-import.txt diff --git a/Documentation/git-fast-import.txt b/Documentation/git-fast-import.txt new file mode 100644 index 0000000000..16308731fb --- /dev/null +++ b/Documentation/git-fast-import.txt @@ -0,0 +1,655 @@ +git-fast-import(1) +================== + +NAME +---- +git-fast-import - Backend for fast Git data importers. 
+ + +SYNOPSIS +-------- +frontend | 'git-fast-import' [options] + +DESCRIPTION +----------- +This program is usually not what the end user wants to run directly. +Most end users want to use one of the existing frontend programs, +which parses a specific type of foreign source and feeds the contents +stored there to git-fast-import (gfi). + +gfi reads a mixed command/data stream from standard input and +writes one or more packfiles directly into the current repository. +When EOF is received on standard input, fast import writes out +updated branch and tag refs, fully updating the current repository +with the newly imported data. + +The gfi backend itself can import into an empty repository (one that +has already been initialized by gitlink:git-init[1]) or incrementally +update an existing populated repository. Whether or not incremental +imports are supported from a particular foreign source depends on +the frontend program in use. + + +OPTIONS +------- +--max-pack-size=<n>:: + Maximum size of each output packfile, expressed in MiB. + The default is 4096 (4 GiB) as that is the maximum allowed + packfile size (due to file format limitations). Some + importers may wish to lower this, such as to ensure the + resulting packfiles fit on CDs. + +--depth=<n>:: + Maximum delta depth, for blob and tree deltification. + Default is 10. + +--active-branches=<n>:: + Maximum number of branches to maintain active at once. + See ``Memory Utilization'' below for details. Default is 5. + +--export-marks=<file>:: + Dumps the internal marks table to <file> when complete. + Marks are written one per line as `:markid SHA-1`. + Frontends can use this file to validate imports after they + have been completed. + +--branch-log=<file>:: + Records every tag and commit made to a log file. (This file + can be quite verbose on large imports.) This particular + option has been primarily intended to facilitate debugging + gfi and has limited usefulness in other contexts. It may + be removed in future versions.
+ + +Performance +----------- +The design of gfi allows it to import large projects in a minimum +amount of memory usage and processing time. Assuming the frontend +is able to keep up with gfi and feed it a constant stream of data, +import times for projects holding 10+ years of history and containing +100,000+ individual commits are generally completed in just 1-2 +hours on quite modest (~$2,000 USD) hardware. + +Most bottlenecks appear to be in foreign source data access (the +source just cannot extract revisions fast enough) or disk IO (gfi +writes as fast as the disk will take the data). Imports will run +faster if the source data is stored on a different drive than the +destination Git repository (due to less IO contention). + + +Development Cost +---------------- +A typical frontend for gfi tends to weigh in at approximately 200 +lines of Perl/Python/Ruby code. Most developers have been able to +create working importers in just a couple of hours, even though it +is their first exposure to gfi, and sometimes even to Git. This is +an ideal situation, given that most conversion tools are throw-away +(use once, and never look back). + + +Parallel Operation +------------------ +Like `git-push` or `git-fetch`, imports handled by gfi are safe to +run alongside parallel `git repack -a -d` or `git gc` invocations, +or any other Git operation (including `git prune`, as loose objects +are never used by gfi). + +However, gfi does not lock the branch or tag refs it is actively +importing. After EOF, during its ref update phase, gfi blindly +overwrites each imported branch or tag ref. Consequently it is not +safe to modify refs that are currently being used by a running gfi +instance, as work could be lost when gfi overwrites the refs. + + +Technical Discussion +-------------------- +gfi tracks a set of branches in memory. Any branch can be created +or modified at any point during the import process by sending a +`commit` command on the input stream. 
This design allows a frontend +program to process an unlimited number of branches simultaneously, +generating commits in the order they are available from the source +data. It also simplifies the frontend programs considerably. + +gfi does not use or alter the current working directory, or any +file within it. (It does however update the current Git repository, +as referenced by `GIT_DIR`.) Therefore an import frontend may use +the working directory for its own purposes, such as extracting file +revisions from the foreign source. This ignorance of the working +directory also allows gfi to run very quickly, as it does not +need to perform any costly file update operations when switching +between branches. + +Input Format +------------ +With the exception of raw file data (which Git does not interpret) +the gfi input format is text (ASCII) based. This text based +format simplifies development and debugging of frontend programs, +especially when a higher level language such as Perl, Python or +Ruby is being used. + +gfi is very strict about its input. Where we say SP below we mean +*exactly* one space. Likewise LF means one (and only one) linefeed. +Supplying additional whitespace characters will cause unexpected +results, such as branch names or file names with leading or trailing +spaces in their name, or early termination of gfi when it encounters +unexpected input. + +Commands +~~~~~~~~ +gfi accepts several commands to update the current repository +and control the current import process. More detailed discussion +(with examples) of each command follows later. + +`commit`:: + Creates a new branch or updates an existing branch by + creating a new commit and updating the branch to point at + the newly created commit. + +`tag`:: + Creates an annotated tag object from an existing commit or + branch. Lightweight tags are not supported by this command, + as they are not recommended for recording meaningful points + in time. 
+ +`reset`:: + Reset an existing branch (or a new branch) to a specific + revision. This command must be used to change a branch to + a specific revision without making a commit on it. + +`blob`:: + Convert raw file data into a blob, for future use in a + `commit` command. This command is optional and is not + needed to perform an import. + +`checkpoint`:: + Forces gfi to close the current packfile, generate its + unique SHA-1 checksum and index, and start a new packfile. + This command is optional and is not needed to perform + an import. + +`commit` +~~~~~~~~ +Create or update a branch with a new commit, recording one logical +change to the project. + +.... + 'commit' SP <ref> LF + mark? + ('author' SP <name> SP LT <email> GT SP