diff --git a/Documentation/gitattributes.txt b/Documentation/gitattributes.txt index d892e642ed..a8500d1772 100644 --- a/Documentation/gitattributes.txt +++ b/Documentation/gitattributes.txt @@ -414,6 +414,26 @@ because it quickly conveys the changes you have made), you should generate it separately and send it as a comment _in addition to_ the usual binary diff that you might send. +Because text conversion can be slow, especially when doing a +large number of them with `git log -p`, git provides a mechanism +to cache the output and use it in future diffs. To enable +caching, set the "cachetextconv" variable in your diff driver's +config. For example: + +------------------------ +[diff "jpg"] + textconv = exif + cachetextconv = true +------------------------ + +This will cache the result of running "exif" on each blob +indefinitely. If you change the textconv config variable for a +diff driver, git will automatically invalidate the cache entries +and re-run the textconv filter. If you want to invalidate the +cache manually (e.g., because your version of "exif" was updated +and now produces better output), you can remove the cache +manually with `git update-ref -d refs/notes/textconv/jpg` (where +"jpg" is the name of the diff driver, as in the example above). Performing a three-way merge ~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/Makefile b/Makefile index eb1d1623c2..4f7224a59a 100644 --- a/Makefile +++ b/Makefile @@ -486,6 +486,7 @@ LIB_H += log-tree.h LIB_H += mailmap.h LIB_H += merge-recursive.h LIB_H += notes.h +LIB_H += notes-cache.h LIB_H += object.h LIB_H += pack.h LIB_H += pack-refs.h @@ -575,6 +576,7 @@ LIB_OBJS += merge-file.o LIB_OBJS += merge-recursive.o LIB_OBJS += name-hash.o LIB_OBJS += notes.o +LIB_OBJS += notes-cache.o LIB_OBJS += object.o LIB_OBJS += pack-check.o LIB_OBJS += pack-refs.o diff --git a/builtin.h b/builtin.h index 464588b299..5c887ef61b 100644 --- a/builtin.h +++ b/builtin.h @@ -16,9 +16,6 @@ extern const char *help_unknown_cmd(const char *cmd); extern void prune_packed_objects(int); extern int fmt_merge_msg(int merge_summary, struct strbuf *in, struct strbuf *out); -extern int commit_tree(const char *msg, unsigned char *tree, - struct commit_list *parents, unsigned char *ret, - const char *author); extern int commit_notes(struct notes_tree *t, const char *msg); struct notes_rewrite_cfg { diff --git a/builtin/commit-tree.c b/builtin/commit-tree.c index 90dac349a3..87f0591c2f 100644 --- a/builtin/commit-tree.c +++ b/builtin/commit-tree.c @@ -9,19 +9,6 @@ #include "builtin.h" #include "utf8.h" -/* - * FIXME! Share the code with "write-tree.c" - */ -static void check_valid(unsigned char *sha1, enum object_type expect) -{ - enum object_type type = sha1_object_info(sha1, NULL); - if (type < 0) - die("%s is not a valid object", sha1_to_hex(sha1)); - if (type != expect) - die("%s is not a valid '%s' object", sha1_to_hex(sha1), - typename(expect)); -} - static const char commit_tree_usage[] = "git commit-tree [-p ]* < changelog"; static void new_parent(struct commit *parent, struct commit_list **parents_p) @@ -38,61 +25,6 @@ static void new_parent(struct commit *parent, struct commit_list **parents_p) commit_list_insert(parent, parents_p); } -static const char commit_utf8_warn[] = -"Warning: commit message does not conform to UTF-8.\n" -"You may want to amend it after fixing the message, or set the config\n" -"variable i18n.commitencoding to the encoding your project uses.\n"; - -int commit_tree(const char *msg, unsigned char *tree, - struct commit_list *parents, unsigned char *ret, - const char *author) -{ - int result; - int encoding_is_utf8; - struct strbuf buffer; - - check_valid(tree, OBJ_TREE); - - /* Not having i18n.commitencoding is the same as having utf-8 */ - encoding_is_utf8 = is_encoding_utf8(git_commit_encoding); - - strbuf_init(&buffer, 8192); /* should avoid reallocs for the headers */ - strbuf_addf(&buffer, "tree %s\n", sha1_to_hex(tree)); - - /* - * NOTE! This ordering means that the same exact tree merged with a - * different order of parents will be a _different_ changeset even - * if everything else stays the same. - */ - while (parents) { - struct commit_list *next = parents->next; - strbuf_addf(&buffer, "parent %s\n", - sha1_to_hex(parents->item->object.sha1)); - free(parents); - parents = next; - } - - /* Person/date information */ - if (!author) - author = git_author_info(IDENT_ERROR_ON_NO_NAME); - strbuf_addf(&buffer, "author %s\n", author); - strbuf_addf(&buffer, "committer %s\n", git_committer_info(IDENT_ERROR_ON_NO_NAME)); - if (!encoding_is_utf8) - strbuf_addf(&buffer, "encoding %s\n", git_commit_encoding); - strbuf_addch(&buffer, '\n'); - - /* And add the comment */ - strbuf_addstr(&buffer, msg); - - /* And check the encoding */ - if (encoding_is_utf8 && !is_utf8(buffer.buf)) - fprintf(stderr, commit_utf8_warn); - - result = write_sha1_file(buffer.buf, buffer.len, commit_type, ret); - strbuf_release(&buffer); - return result; -} - int cmd_commit_tree(int argc, const char **argv, const char *prefix) { int i; @@ -117,7 +49,7 @@ int cmd_commit_tree(int argc, const char **argv, const char *prefix) if (get_sha1(b, sha1)) die("Not a valid object name %s", b); - check_valid(sha1, OBJ_COMMIT); + assert_sha1_type(sha1, OBJ_COMMIT); new_parent(lookup_commit(sha1), &parents); } diff --git a/cache.h b/cache.h index 5eb0573bcc..bfc4d82f6c 100644 --- a/cache.h +++ b/cache.h @@ -718,6 +718,8 @@ extern int has_loose_object_nonlocal(const unsigned char *sha1); extern int has_pack_index(const unsigned char *sha1); +extern void assert_sha1_type(const unsigned char *sha1, enum object_type expect); + extern const signed char hexval_table[256]; static inline unsigned int hexval(unsigned char c) { diff --git a/commit.c b/commit.c index 731191e63b..e9b0750967 100644 --- a/commit.c +++ b/commit.c @@ -790,3 +790,58 @@ struct commit_list *reduce_heads(struct commit_list *heads) free(other); return result; } + +static const char commit_utf8_warn[] = +"Warning: commit message does not conform to UTF-8.\n" +"You may want to amend it after fixing the message, or set the config\n" +"variable i18n.commitencoding to the encoding your project uses.\n"; + +int commit_tree(const char *msg, unsigned char *tree, + struct commit_list *parents, unsigned char *ret, + const char *author) +{ + int result; + int encoding_is_utf8; + struct strbuf buffer; + + assert_sha1_type(tree, OBJ_TREE); + + /* Not having i18n.commitencoding is the same as having utf-8 */ + encoding_is_utf8 = is_encoding_utf8(git_commit_encoding); + + strbuf_init(&buffer, 8192); /* should avoid reallocs for the headers */ + strbuf_addf(&buffer, "tree %s\n", sha1_to_hex(tree)); + + /* + * NOTE! This ordering means that the same exact tree merged with a + * different order of parents will be a _different_ changeset even + * if everything else stays the same. + */ + while (parents) { + struct commit_list *next = parents->next; + strbuf_addf(&buffer, "parent %s\n", + sha1_to_hex(parents->item->object.sha1)); + free(parents); + parents = next; + } + + /* Person/date information */ + if (!author) + author = git_author_info(IDENT_ERROR_ON_NO_NAME); + strbuf_addf(&buffer, "author %s\n", author); + strbuf_addf(&buffer, "committer %s\n", git_committer_info(IDENT_ERROR_ON_NO_NAME)); + if (!encoding_is_utf8) + strbuf_addf(&buffer, "encoding %s\n", git_commit_encoding); + strbuf_addch(&buffer, '\n'); + + /* And add the comment */ + strbuf_addstr(&buffer, msg); + + /* And check the encoding */ + if (encoding_is_utf8 && !is_utf8(buffer.buf)) + fprintf(stderr, commit_utf8_warn); + + result = write_sha1_file(buffer.buf, buffer.len, commit_type, ret); + strbuf_release(&buffer); + return result; +} diff --git a/commit.h b/commit.h index 26ec8c0d1c..6ef88dcf45 100644 --- a/commit.h +++ b/commit.h @@ -163,4 +163,8 @@ static inline int single_parent(struct commit *commit) struct commit_list *reduce_heads(struct commit_list *heads); +extern int commit_tree(const char *msg, unsigned char *tree, + struct commit_list *parents, unsigned char *ret, + const char *author); + #endif /* COMMIT_H */ diff --git a/diff.c b/diff.c index e40c1271da..e49f14a924 100644 --- a/diff.c +++ b/diff.c @@ -44,7 +44,8 @@ static char diff_colors[][COLOR_MAXLEN] = { }; static void diff_filespec_load_driver(struct diff_filespec *one); -static char *run_textconv(const char *, struct diff_filespec *, size_t *); +static size_t fill_textconv(struct userdiff_driver *driver, + struct diff_filespec *df, char **outbuf); static int parse_diff_color_slot(const char *var, int ofs) { @@ -466,8 +467,8 @@ static void emit_rewrite_diff(const char *name_a, const char *name_b, struct diff_filespec *one, struct diff_filespec *two, - const char *textconv_one, - const char *textconv_two, + struct userdiff_driver *textconv_one, + struct userdiff_driver *textconv_two, struct diff_options *o) { int lc_a, lc_b; @@ -478,7 +479,7 @@ static void emit_rewrite_diff(const char *name_a, const char *reset = diff_get_color(color_diff, DIFF_RESET); static struct strbuf a_name = STRBUF_INIT, b_name = STRBUF_INIT; const char *a_prefix, *b_prefix; - const char *data_one, *data_two; + char *data_one, *data_two; size_t size_one, size_two; struct emit_callback ecbdata; @@ -500,26 +501,8 @@ static void emit_rewrite_diff(const char *name_a, quote_two_c_style(&a_name, a_prefix, name_a, 0); quote_two_c_style(&b_name, b_prefix, name_b, 0); - diff_populate_filespec(one, 0); - diff_populate_filespec(two, 0); - if (textconv_one) { - data_one = run_textconv(textconv_one, one, &size_one); - if (!data_one) - die("unable to read files to diff"); - } - else { - data_one = one->data; - size_one = one->size; - } - if (textconv_two) { - data_two = run_textconv(textconv_two, two, &size_two); - if (!data_two) - die("unable to read files to diff"); - } - else { - data_two = two->data; - size_two = two->size; - } + size_one = fill_textconv(textconv_one, one, &data_one); + size_two = fill_textconv(textconv_two, two, &data_two); memset(&ecbdata, 0, sizeof(ecbdata)); ecbdata.color_diff = color_diff; @@ -1585,14 +1568,26 @@ void diff_set_mnemonic_prefix(struct diff_options *options, const char *a, const options->b_prefix = b; } -static const char *get_textconv(struct diff_filespec *one) +static struct userdiff_driver *get_textconv(struct diff_filespec *one) { if (!DIFF_FILE_VALID(one)) return NULL; if (!S_ISREG(one->mode)) return NULL; diff_filespec_load_driver(one); - return one->driver->textconv; + if (!one->driver->textconv) + return NULL; + + if (one->driver->textconv_want_cache && !one->driver->textconv_cache) { + struct notes_cache *c = xmalloc(sizeof(*c)); + struct strbuf name = STRBUF_INIT; + + strbuf_addf(&name, "textconv/%s", one->driver->name); + notes_cache_init(c, name.buf, one->driver->textconv); + one->driver->textconv_cache = c; + } + + return one->driver; } static void builtin_diff(const char *name_a, @@ -1609,7 +1604,8 @@ static void builtin_diff(const char *name_a, const char *set = diff_get_color_opt(o, DIFF_METAINFO); const char *reset = diff_get_color_opt(o, DIFF_RESET); const char *a_prefix, *b_prefix; - const char *textconv_one = NULL, *textconv_two = NULL; + struct userdiff_driver *textconv_one = NULL; + struct userdiff_driver *textconv_two = NULL; struct strbuf header = STRBUF_INIT; if (DIFF_OPT_TST(o, SUBMODULE_LOG) && @@ -1683,12 +1679,11 @@ static void builtin_diff(const char *name_a, } } - if (fill_mmfile(&mf1, one) < 0 || fill_mmfile(&mf2, two) < 0) - die("unable to read files to diff"); - if (!DIFF_OPT_TST(o, TEXT) && - ( (diff_filespec_is_binary(one) && !textconv_one) || - (diff_filespec_is_binary(two) && !textconv_two) )) { + ( (!textconv_one && diff_filespec_is_binary(one)) || + (!textconv_two && diff_filespec_is_binary(two)) )) { + if (fill_mmfile(&mf1, one) < 0 || fill_mmfile(&mf2, two) < 0) + die("unable to read files to diff"); /* Quite common confusing case */ if (mf1.size == mf2.size && !memcmp(mf1.ptr, mf2.ptr, mf1.size)) @@ -1715,20 +1710,8 @@ static void builtin_diff(const char *name_a, strbuf_reset(&header); } - if (textconv_one) { - size_t size; - mf1.ptr = run_textconv(textconv_one, one, &size); - if (!mf1.ptr) - die("unable to read files to diff"); - mf1.size = size; - } - if (textconv_two) { - size_t size; - mf2.ptr = run_textconv(textconv_two, two, &size); - if (!mf2.ptr) - die("unable to read files to diff"); - mf2.size = size; - } + mf1.size = fill_textconv(textconv_one, one, &mf1.ptr); + mf2.size = fill_textconv(textconv_two, two, &mf2.ptr); pe = diff_funcname_pattern(one); if (!pe) @@ -3912,3 +3895,47 @@ static char *run_textconv(const char *pgm, struct diff_filespec *spec, return strbuf_detach(&buf, outsize); } + +static size_t fill_textconv(struct userdiff_driver *driver, + struct diff_filespec *df, + char **outbuf) +{ + size_t size; + + if (!driver || !driver->textconv) { + if (!DIFF_FILE_VALID(df)) { + *outbuf = ""; + return 0; + } + if (diff_populate_filespec(df, 0)) + die("unable to read files to diff"); + *outbuf = df->data; + return df->size; + } + + if (driver->textconv_cache) { + *outbuf = notes_cache_get(driver->textconv_cache, df->sha1, + &size); + if (*outbuf) + return size; + } + + *outbuf = run_textconv(driver->textconv, df, &size); + if (!*outbuf) + die("unable to read files to diff"); + + if (driver->textconv_cache) { + /* ignore errors, as we might be in a readonly repository */ + notes_cache_put(driver->textconv_cache, df->sha1, *outbuf, + size); + /* + * we could save up changes and flush them all at the end, + * but we would need an extra call after all diffing is done. + * Since generating a cache entry is the slow path anyway, + * this extra overhead probably isn't a big deal. + */ + notes_cache_write(driver->textconv_cache); + } + + return size; +} diff --git a/notes-cache.c b/notes-cache.c new file mode 100644 index 0000000000..dee6d62e72 --- /dev/null +++ b/notes-cache.c @@ -0,0 +1,94 @@ +#include "cache.h" +#include "notes-cache.h" +#include "commit.h" +#include "refs.h" + +static int notes_cache_match_validity(const char *ref, const char *validity) +{ + unsigned char sha1[20]; + struct commit *commit; + struct pretty_print_context pretty_ctx; + struct strbuf msg = STRBUF_INIT; + int ret; + + if (read_ref(ref, sha1) < 0) + return 0; + + commit = lookup_commit_reference_gently(sha1, 1); + if (!commit) + return 0; + + memset(&pretty_ctx, 0, sizeof(pretty_ctx)); + format_commit_message(commit, "%s", &msg, &pretty_ctx); + strbuf_trim(&msg); + + ret = !strcmp(msg.buf, validity); + strbuf_release(&msg); + + return ret; +} + +void notes_cache_init(struct notes_cache *c, const char *name, + const char *validity) +{ + struct strbuf ref = STRBUF_INIT; + int flags = 0; + + memset(c, 0, sizeof(*c)); + c->validity = xstrdup(validity); + + strbuf_addf(&ref, "refs/notes/%s", name); + if (!notes_cache_match_validity(ref.buf, validity)) + flags = NOTES_INIT_EMPTY; + init_notes(&c->tree, ref.buf, combine_notes_overwrite, flags); + strbuf_release(&ref); +} + +int notes_cache_write(struct notes_cache *c) +{ + unsigned char tree_sha1[20]; + unsigned char commit_sha1[20]; + + if (!c || !c->tree.initialized || !c->tree.ref || !*c->tree.ref) + return -1; + if (!c->tree.dirty) + return 0; + + if (write_notes_tree(&c->tree, tree_sha1)) + return -1; + if (commit_tree(c->validity, tree_sha1, NULL, commit_sha1, NULL) < 0) + return -1; + if (update_ref("update notes cache", c->tree.ref, commit_sha1, NULL, + 0, QUIET_ON_ERR) < 0) + return -1; + + return 0; +} + +char *notes_cache_get(struct notes_cache *c, unsigned char key_sha1[20], + size_t *outsize) +{ + const unsigned char *value_sha1; + enum object_type type; + char *value; + unsigned long size; + + value_sha1 = get_note(&c->tree, key_sha1); + if (!value_sha1) + return NULL; + value = read_sha1_file(value_sha1, &type, &size); + + *outsize = size; + return value; +} + +int notes_cache_put(struct notes_cache *c, unsigned char key_sha1[20], + const char *data, size_t size) +{ + unsigned char value_sha1[20]; + + if (write_sha1_file(data, size, "blob", value_sha1) < 0) + return -1; + add_note(&c->tree, key_sha1, value_sha1, NULL); + return 0; +} diff --git a/notes-cache.h b/notes-cache.h new file mode 100644 index 0000000000..356f88fb3c --- /dev/null +++ b/notes-cache.h @@ -0,0 +1,20 @@ +#ifndef NOTES_CACHE_H +#define NOTES_CACHE_H + +#include "notes.h" + +struct notes_cache { + struct notes_tree tree; + char *validity; +}; + +void notes_cache_init(struct notes_cache *c, const char *name, + const char *validity); +int notes_cache_write(struct notes_cache *c); + +char *notes_cache_get(struct notes_cache *c, unsigned char sha1[20], size_t + *outsize); +int notes_cache_put(struct notes_cache *c, unsigned char sha1[20], + const char *data, size_t size); + +#endif /* NOTES_CACHE_H */ diff --git a/sha1_file.c b/sha1_file.c index ff65328006..28c056e074 100644 --- a/sha1_file.c +++ b/sha1_file.c @@ -2516,3 +2516,13 @@ int read_pack_header(int fd, struct pack_header *header) return PH_ERROR_PROTOCOL; return 0; } + +void assert_sha1_type(const unsigned char *sha1, enum object_type expect) +{ + enum object_type type = sha1_object_info(sha1, NULL); + if (type < 0) + die("%s is not a valid object", sha1_to_hex(sha1)); + if (type != expect) + die("%s is not a valid '%s' object", sha1_to_hex(sha1), + typename(expect)); +} diff --git a/t/t4042-diff-textconv-caching.sh b/t/t4042-diff-textconv-caching.sh new file mode 100755 index 0000000000..91f8198f05 --- /dev/null +++ b/t/t4042-diff-textconv-caching.sh @@ -0,0 +1,109 @@ +#!/bin/sh + +test_description='test textconv caching' +. ./test-lib.sh + +cat >helper <<'EOF' +#!/bin/sh +sed 's/^/converted: /' "$@" >helper.out +cat helper.out +EOF +chmod +x helper + +test_expect_success 'setup' ' + echo foo content 1 >foo.bin && + echo bar content 1 >bar.bin && + git add . && + git commit -m one && + echo foo content 2 >foo.bin && + echo bar content 2 >bar.bin && + git commit -a -m two && + echo "*.bin diff=magic" >.gitattributes && + git config diff.magic.textconv ./helper && + git config diff.magic.cachetextconv true +' + +cat >expect <actual && + test_cmp expect actual +' + +test_expect_success 'cached textconv produces same output' ' + git diff HEAD^ HEAD >actual && + test_cmp expect actual +' + +test_expect_success 'cached textconv does not run helper' ' + rm -f helper.out && + git diff HEAD^ HEAD >actual && + test_cmp expect actual && + ! test -r helper.out +' + +cat >expect <other && + git config diff.magic.textconv "./helper other" && + git diff HEAD^ HEAD >actual && + test_cmp expect actual +' + +cat >expect <>.gitattributes && + git diff HEAD^ HEAD >actual && + test_cmp expect actual +' + +test_done diff --git a/userdiff.c b/userdiff.c index df992490d5..67003fbb23 100644 --- a/userdiff.c +++ b/userdiff.c @@ -1,3 +1,4 @@ +#include "cache.h" #include "userdiff.h" #include "cache.h" #include "attr.h" @@ -167,6 +168,12 @@ static int parse_tristate(int *b, const char *k, const char *v) return 1; } +static int parse_bool(int *b, const char *k, const char *v) +{ + *b = git_config_bool(k, v); + return 1; +} + int userdiff_config(const char *k, const char *v) { struct userdiff_driver *drv; @@ -181,6 +188,8 @@ int userdiff_config(const char *k, const char *v) return parse_string(&drv->external, k, v); if ((drv = parse_driver(k, v, "textconv"))) return parse_string(&drv->textconv, k, v); + if ((drv = parse_driver(k, v, "cachetextconv"))) + return parse_bool(&drv->textconv_want_cache, k, v); if ((drv = parse_driver(k, v, "wordregex"))) return parse_string(&drv->word_regex, k, v); diff --git a/userdiff.h b/userdiff.h index c3151594f5..942d594950 100644 --- a/userdiff.h +++ b/userdiff.h @@ -1,6 +1,8 @@ #ifndef USERDIFF_H #define USERDIFF_H +#include "notes-cache.h" + struct userdiff_funcname { const char *pattern; int cflags; @@ -13,6 +15,8 @@ struct userdiff_driver { struct userdiff_funcname funcname; const char *word_regex; const char *textconv; + struct notes_cache *textconv_cache; + int textconv_want_cache; }; int userdiff_config(const char *k, const char *v);