git/merge-ll.c

456 строки
12 KiB
C
Исходник Постоянная ссылка Обычный вид История

/*
* Low level 3-way in-core file merge.
*
* Copyright (c) 2007 Junio C Hamano
*/
#include "git-compat-util.h"
#include "config.h"
#include "convert.h"
#include "attr.h"
#include "xdiff-interface.h"
#include "run-command.h"
merge-ll: rename from ll-merge A long term (but rather minor) pet-peeve of mine was the name ll-merge.[ch]. I thought it made it harder to realize what stuff was related to merging when I was working on the merge machinery and trying to improve it. Further, back in d1cbe1e6d8a ("hash-ll.h: split out of hash.h to remove dependency on repository.h", 2023-04-22), we have split the portions of hash.h that do not depend upon repository.h into a "hash-ll.h" (due to the recommendation to use "ll" for "low-level" in its name[1], but which I used as a suffix precisely because of my distaste for "ll-merge"). When we discussed adding additional "*-ll.h" files, a request was made that we use "ll" consistently as either a prefix or a suffix. Since it is already in use as both a prefix and a suffix, the only way to do so is to rename some files. Besides my distaste for the ll-merge.[ch] name, let me also note that the files ll-fsmonitor.h, ll-hash.h, ll-merge.h, ll-object-store.h, ll-read-cache.h would have essentially nothing to do with each other and make no sense to group. But giving them the common "ll-" prefix would group them. Using "-ll" as a suffix thus seems just much more logical to me. Rename ll-merge.[ch] to merge-ll.[ch] to achieve this consistency, and to ensure we get a more logical grouping of files. [1] https://lore.kernel.org/git/kl6lsfcu1g8w.fsf@chooglen-macbookpro.roam.corp.google.com/ Signed-off-by: Elijah Newren <newren@gmail.com> Signed-off-by: Junio C Hamano <gitster@pobox.com>
2023-05-16 09:34:04 +03:00
#include "merge-ll.h"
#include "quote.h"
#include "strbuf.h"
struct ll_merge_driver;
typedef enum ll_merge_result (*ll_merge_fn)(const struct ll_merge_driver *,
mmbuffer_t *result,
const char *path,
mmfile_t *orig, const char *orig_name,
mmfile_t *src1, const char *name1,
mmfile_t *src2, const char *name2,
const struct ll_merge_options *opts,
int marker_size);
struct ll_merge_driver {
const char *name;
const char *description;
ll_merge_fn fn;
const char *recursive;
struct ll_merge_driver *next;
char *cmdline;
};
static struct attr_check *merge_attributes;
static struct attr_check *load_merge_attributes(void)
{
if (!merge_attributes)
merge_attributes = attr_check_initl("merge", "conflict-marker-size", NULL);
return merge_attributes;
}
void reset_merge_attributes(void)
{
attr_check_free(merge_attributes);
merge_attributes = NULL;
}
/*
* Built-in low-levels
*/
static enum ll_merge_result ll_binary_merge(const struct ll_merge_driver *drv UNUSED,
mmbuffer_t *result,
const char *path UNUSED,
mmfile_t *orig, const char *orig_name UNUSED,
mmfile_t *src1, const char *name1 UNUSED,
mmfile_t *src2, const char *name2 UNUSED,
const struct ll_merge_options *opts,
int marker_size UNUSED)
{
enum ll_merge_result ret;
mmfile_t *stolen;
assert(opts);
/*
* The tentative merge result is the common ancestor for an
* internal merge. For the final merge, it is "ours" by
* default but -Xours/-Xtheirs can tweak the choice.
*/
if (opts->virtual_ancestor) {
stolen = orig;
ret = LL_MERGE_OK;
} else {
switch (opts->variant) {
default:
ret = LL_MERGE_BINARY_CONFLICT;
stolen = src1;
break;
case XDL_MERGE_FAVOR_OURS:
ret = LL_MERGE_OK;
stolen = src1;
break;
case XDL_MERGE_FAVOR_THEIRS:
ret = LL_MERGE_OK;
stolen = src2;
break;
}
}
result->ptr = stolen->ptr;
result->size = stolen->size;
stolen->ptr = NULL;
return ret;
}
static enum ll_merge_result ll_xdl_merge(const struct ll_merge_driver *drv_unused,
mmbuffer_t *result,
const char *path,
mmfile_t *orig, const char *orig_name,
mmfile_t *src1, const char *name1,
mmfile_t *src2, const char *name2,
const struct ll_merge_options *opts,
int marker_size)
{
enum ll_merge_result ret;
xmparam_t xmp;
int status;
assert(opts);
xdiff: reject files larger than ~1GB The xdiff code is not prepared to handle extremely large files. It uses "int" in many places, which can overflow if we have a very large number of lines or even bytes in our input files. This can cause us to produce incorrect diffs, with no indication that the output is wrong. Or worse, we may even underallocate a buffer whose size is the result of an overflowing addition. We're much better off to tell the user that we cannot diff or merge such a large file. This patch covers both cases, but in slightly different ways: 1. For merging, we notice the large file and cleanly fall back to a binary merge (which is effectively "we cannot merge this"). 2. For diffing, we make the binary/text distinction much earlier, and in many different places. For this case, we'll use the xdi_diff as our choke point, and reject any diff there before it hits the xdiff code. This means in most cases we'll die() immediately after. That's not ideal, but in practice we shouldn't generally hit this code path unless the user is trying to do something tricky. We already consider files larger than core.bigfilethreshold to be binary, so this code would only kick in when that is circumvented (either by bumping that value, or by using a .gitattribute to mark a file as diffable). In other words, we can avoid being "nice" here, because there is already nice code that tries to do the right thing. We are adding the suspenders to the nice code's belt, so notice when it has been worked around (both to protect the user from malicious inputs, and because it is better to die() than generate bogus output). The maximum size was chosen after experimenting with feeding large files to the xdiff code. It's just under a gigabyte, which leaves room for two obvious cases: - a diff3 merge conflict result on files of maximum size X could be 3*X plus the size of the markers, which would still be only about 3G, which fits in a 32-bit int. - some of the diff code allocates arrays of one int per record. Even if each file consists only of blank lines, then a file smaller than 1G will have fewer than 1G records, and therefore the int array will fit in 4G. Since the limit is arbitrary anyway, I chose to go under a gigabyte, to leave a safety margin (e.g., we would not want to overflow by allocating "(records + 1) * sizeof(int)" or similar. Signed-off-by: Jeff King <peff@peff.net> Signed-off-by: Junio C Hamano <gitster@pobox.com>
2015-09-25 02:12:45 +03:00
if (orig->size > MAX_XDIFF_SIZE ||
src1->size > MAX_XDIFF_SIZE ||
src2->size > MAX_XDIFF_SIZE ||
buffer_is_binary(orig->ptr, orig->size) ||
buffer_is_binary(src1->ptr, src1->size) ||
buffer_is_binary(src2->ptr, src2->size)) {
return ll_binary_merge(drv_unused, result,
path,
orig, orig_name,
src1, name1,
src2, name2,
opts, marker_size);
}
memset(&xmp, 0, sizeof(xmp));
xmp.level = XDL_MERGE_ZEALOUS;
xmp.favor = opts->variant;
xmp.xpp.flags = opts->xdl_opts;
if (opts->conflict_style >= 0)
xmp.style = opts->conflict_style;
else if (git_xmerge_style >= 0)
xmp.style = git_xmerge_style;
if (marker_size > 0)
xmp.marker_size = marker_size;
xmp.ancestor = orig_name;
xmp.file1 = name1;
xmp.file2 = name2;
status = xdl_merge(orig, src1, src2, &xmp, result);
ret = (status > 0) ? LL_MERGE_CONFLICT : status;
return ret;
}
static enum ll_merge_result ll_union_merge(const struct ll_merge_driver *drv_unused,
mmbuffer_t *result,
const char *path,
mmfile_t *orig, const char *orig_name,
mmfile_t *src1, const char *name1,
mmfile_t *src2, const char *name2,
const struct ll_merge_options *opts,
int marker_size)
{
/* Use union favor */
struct ll_merge_options o;
assert(opts);
o = *opts;
o.variant = XDL_MERGE_FAVOR_UNION;
return ll_xdl_merge(drv_unused, result, path,
ll_union_merge(): pass name labels to ll_xdl_merge() Since cd1d61c44f (make union merge an xdl merge favor, 2010-03-01), we pass NULL to ll_xdl_merge() for the "name" labels of the ancestor, ours and theirs buffers. We usually use these for annotating conflict markers left in a file. For a union merge, these shouldn't matter; the point of it is that we'd never leave conflict markers in the first place. But there is one code path where we may dereference them: if the file contents appear to be binary, ll_binary_merge() will give up and pass them to warning() to generate a message for the user (that was true even when cd1d61c44f was written, though the warning was in ll_xdl_merge() back then). That can result in a segfault, though on many systems (including glibc), the printf routines will helpfully just say "(null)" instead. We can extend our binary-union test in t6406 to check stderr, which catches the problem on all systems. This also fixes a warning from "gcc -O3". Unlike lower optimization levels, it inlines enough to see that the NULL can make it to warning() and complains: In function ‘ll_binary_merge’, inlined from ‘ll_xdl_merge’ at ll-merge.c:115:10, inlined from ‘ll_union_merge’ at ll-merge.c:151:9: ll-merge.c:74:4: warning: ‘%s’ directive argument is null [-Wformat-overflow=] 74 | warning("Cannot merge binary files: %s (%s vs. %s)", | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 75 | path, name1, name2); | ~~~~~~~~~~~~~~~~~~~ Signed-off-by: Jeff King <peff@peff.net> Signed-off-by: Junio C Hamano <gitster@pobox.com>
2021-06-10 15:58:43 +03:00
orig, orig_name, src1, name1, src2, name2,
&o, marker_size);
}
#define LL_BINARY_MERGE 0
#define LL_TEXT_MERGE 1
#define LL_UNION_MERGE 2
static struct ll_merge_driver ll_merge_drv[] = {
{ "binary", "built-in binary merge", ll_binary_merge },
{ "text", "built-in 3-way text merge", ll_xdl_merge },
{ "union", "built-in union merge", ll_union_merge },
};
static void create_temp(mmfile_t *src, char *path, size_t len)
{
int fd;
xsnprintf(path, len, ".merge_file_XXXXXX");
fd = xmkstemp(path);
avoid "write_in_full(fd, buf, len) != len" pattern The return value of write_in_full() is either "-1", or the requested number of bytes[1]. If we make a partial write before seeing an error, we still return -1, not a partial value. This goes back to f6aa66cb95 (write_in_full: really write in full or return error on disk full., 2007-01-11). So checking anything except "was the return value negative" is pointless. And there are a couple of reasons not to do so: 1. It can do a funny signed/unsigned comparison. If your "len" is signed (e.g., a size_t) then the compiler will promote the "-1" to its unsigned variant. This works out for "!= len" (unless you really were trying to write the maximum size_t bytes), but is a bug if you check "< len" (an example of which was fixed recently in config.c). We should avoid promoting the mental model that you need to check the length at all, so that new sites are not tempted to copy us. 2. Checking for a negative value is shorter to type, especially when the length is an expression. 3. Linus says so. In d34cf19b89 (Clean up write_in_full() users, 2007-01-11), right after the write_in_full() semantics were changed, he wrote: I really wish every "write_in_full()" user would just check against "<0" now, but this fixes the nasty and stupid ones. Appeals to authority aside, this makes it clear that writing it this way does not have an intentional benefit. It's a historical curiosity that we never bothered to clean up (and which was undoubtedly cargo-culted into new sites). So let's convert these obviously-correct cases (this includes write_str_in_full(), which is just a wrapper for write_in_full()). [1] A careful reader may notice there is one way that write_in_full() can return a different value. If we ask write() to write N bytes and get a return value that is _larger_ than N, we could return a larger total. But besides the fact that this would imply a totally broken version of write(), it would already invoke undefined behavior. Our internal remaining counter is an unsigned size_t, which means that subtracting too many byte will wrap it around to a very large number. So we'll instantly begin reading off the end of the buffer, trying to write gigabytes (or petabytes) of data. Signed-off-by: Jeff King <peff@peff.net> Reviewed-by: Jonathan Nieder <jrnieder@gmail.com> Signed-off-by: Junio C Hamano <gitster@pobox.com>
2017-09-13 20:16:03 +03:00
if (write_in_full(fd, src->ptr, src->size) < 0)
die_errno("unable to write temp-file");
close(fd);
}
/*
* User defined low-level merge driver support.
*/
static enum ll_merge_result ll_ext_merge(const struct ll_merge_driver *fn,
mmbuffer_t *result,
const char *path,
mmfile_t *orig, const char *orig_name,
mmfile_t *src1, const char *name1,
mmfile_t *src2, const char *name2,
const struct ll_merge_options *opts,
int marker_size)
{
char temp[3][50];
struct strbuf cmd = STRBUF_INIT;
const char *format = fn->cmdline;
struct child_process child = CHILD_PROCESS_INIT;
int status, fd, i;
struct stat st;
enum ll_merge_result ret;
assert(opts);
if (!fn->cmdline)
die("custom merge driver %s lacks command line.", fn->name);
result->ptr = NULL;
result->size = 0;
create_temp(orig, temp[0], sizeof(temp[0]));
create_temp(src1, temp[1], sizeof(temp[1]));
create_temp(src2, temp[2], sizeof(temp[2]));
while (strbuf_expand_step(&cmd, &format)) {
if (skip_prefix(format, "%", &format))
strbuf_addch(&cmd, '%');
else if (skip_prefix(format, "O", &format))
strbuf_addstr(&cmd, temp[0]);
else if (skip_prefix(format, "A", &format))
strbuf_addstr(&cmd, temp[1]);
else if (skip_prefix(format, "B", &format))
strbuf_addstr(&cmd, temp[2]);
else if (skip_prefix(format, "L", &format))
strbuf_addf(&cmd, "%d", marker_size);
else if (skip_prefix(format, "P", &format))
sq_quote_buf(&cmd, path);
else if (skip_prefix(format, "S", &format))
sq_quote_buf(&cmd, orig_name ? orig_name : "");
else if (skip_prefix(format, "X", &format))
sq_quote_buf(&cmd, name1 ? name1 : "");
else if (skip_prefix(format, "Y", &format))
sq_quote_buf(&cmd, name2 ? name2 : "");
else
strbuf_addch(&cmd, '%');
}
child.use_shell = 1;
strvec_push(&child.args, cmd.buf);
status = run_command(&child);
fd = open(temp[1], O_RDONLY);
if (fd < 0)
goto bad;
if (fstat(fd, &st))
goto close_bad;
result->size = st.st_size;
result->ptr = xmallocz(result->size);
if (read_in_full(fd, result->ptr, result->size) != result->size) {
FREE_AND_NULL(result->ptr);
result->size = 0;
}
close_bad:
close(fd);
bad:
for (i = 0; i < 3; i++)
unlink_or_warn(temp[i]);
strbuf_release(&cmd);
if (!status)
ret = LL_MERGE_OK;
else if (status <= 128)
ret = LL_MERGE_CONFLICT;
else
/* died due to a signal: WTERMSIG(status) + 128 */
ret = LL_MERGE_ERROR;
return ret;
}
/*
* merge.default and merge.driver configuration items
*/
static struct ll_merge_driver *ll_user_merge, **ll_user_merge_tail;
static const char *default_ll_merge;
static int read_merge_config(const char *var, const char *value,
config: add ctx arg to config_fn_t Add a new "const struct config_context *ctx" arg to config_fn_t to hold additional information about the config iteration operation. config_context has a "struct key_value_info kvi" member that holds metadata about the config source being read (e.g. what kind of config source it is, the filename, etc). In this series, we're only interested in .kvi, so we could have just used "struct key_value_info" as an arg, but config_context makes it possible to add/adjust members in the future without changing the config_fn_t signature. We could also consider other ways of organizing the args (e.g. moving the config name and value into config_context or key_value_info), but in my experiments, the incremental benefit doesn't justify the added complexity (e.g. a config_fn_t will sometimes invoke another config_fn_t but with a different config value). In subsequent commits, the .kvi member will replace the global "struct config_reader" in config.c, making config iteration a global-free operation. It requires much more work for the machinery to provide meaningful values of .kvi, so for now, merely change the signature and call sites, pass NULL as a placeholder value, and don't rely on the arg in any meaningful way. Most of the changes are performed by contrib/coccinelle/config_fn_ctx.pending.cocci, which, for every config_fn_t: - Modifies the signature to accept "const struct config_context *ctx" - Passes "ctx" to any inner config_fn_t, if needed - Adds UNUSED attributes to "ctx", if needed Most config_fn_t instances are easily identified by seeing if they are called by the various config functions. Most of the remaining ones are manually named in the .cocci patch. Manual cleanups are still needed, but the majority of it is trivial; it's either adjusting config_fn_t that the .cocci patch didn't catch, or adding forward declarations of "struct config_context ctx" to make the signatures make sense. The non-trivial changes are in cases where we are invoking a config_fn_t outside of config machinery, and we now need to decide what value of "ctx" to pass. These cases are: - trace2/tr2_cfg.c:tr2_cfg_set_fl() This is indirectly called by git_config_set() so that the trace2 machinery can notice the new config values and update its settings using the tr2 config parsing function, i.e. tr2_cfg_cb(). - builtin/checkout.c:checkout_main() This calls git_xmerge_config() as a shorthand for parsing a CLI arg. This might be worth refactoring away in the future, since git_xmerge_config() can call git_default_config(), which can do much more than just parsing. Handle them by creating a KVI_INIT macro that initializes "struct key_value_info" to a reasonable default, and use that to construct the "ctx" arg. Signed-off-by: Glen Choo <chooglen@google.com> Signed-off-by: Junio C Hamano <gitster@pobox.com>
2023-06-28 22:26:22 +03:00
const struct config_context *ctx UNUSED,
void *cb UNUSED)
{
struct ll_merge_driver *fn;
const char *key, *name;
size_t namelen;
if (!strcmp(var, "merge.default"))
return git_config_string(&default_ll_merge, var, value);
/*
* We are not interested in anything but "merge.<name>.variable";
* especially, we do not want to look at variables such as
* "merge.summary", "merge.tool", and "merge.verbosity".
*/
if (parse_config_key(var, "merge", &name, &namelen, &key) < 0 || !name)
return 0;
/*
* Find existing one as we might be processing merge.<name>.var2
* after seeing merge.<name>.var1.
*/
for (fn = ll_user_merge; fn; fn = fn->next)
if (!xstrncmpz(fn->name, name, namelen))
break;
if (!fn) {
CALLOC_ARRAY(fn, 1);
fn->name = xmemdupz(name, namelen);
fn->fn = ll_ext_merge;
*ll_user_merge_tail = fn;
ll_user_merge_tail = &(fn->next);
}
if (!strcmp("name", key))
return git_config_string(&fn->description, var, value);
if (!strcmp("driver", key)) {
if (!value)
return config_error_nonbool(var);
/*
* merge.<name>.driver specifies the command line:
*
* command-line
*
* The command-line will be interpolated with the following
* tokens and is given to the shell:
*
* %O - temporary file name for the merge base.
* %A - temporary file name for our version.
* %B - temporary file name for the other branches' version.
* %L - conflict marker length
* %P - the original path (safely quoted for the shell)
* %S - the revision for the merge base
* %X - the revision for our version
* %Y - the revision for their version
*
* If the file is not named indentically in all versions, then each
* revision is joined with the corresponding path, separated by a colon.
* The external merge driver should write the results in the
* file named by %A, and signal that it has done with zero exit
* status.
*/
fn->cmdline = xstrdup(value);
return 0;
}
if (!strcmp("recursive", key))
return git_config_string(&fn->recursive, var, value);
return 0;
}
static void initialize_ll_merge(void)
{
if (ll_user_merge_tail)
return;
ll_user_merge_tail = &ll_user_merge;
git_config(read_merge_config, NULL);
}
static const struct ll_merge_driver *find_ll_merge_driver(const char *merge_attr)
{
struct ll_merge_driver *fn;
const char *name;
int i;
initialize_ll_merge();
if (ATTR_TRUE(merge_attr))
return &ll_merge_drv[LL_TEXT_MERGE];
else if (ATTR_FALSE(merge_attr))
return &ll_merge_drv[LL_BINARY_MERGE];
else if (ATTR_UNSET(merge_attr)) {
if (!default_ll_merge)
return &ll_merge_drv[LL_TEXT_MERGE];
else
name = default_ll_merge;
}
else
name = merge_attr;
for (fn = ll_user_merge; fn; fn = fn->next)
if (!strcmp(fn->name, name))
return fn;
for (i = 0; i < ARRAY_SIZE(ll_merge_drv); i++)
if (!strcmp(ll_merge_drv[i].name, name))
return &ll_merge_drv[i];
/* default to the 3-way */
return &ll_merge_drv[LL_TEXT_MERGE];
}
static void normalize_file(mmfile_t *mm, const char *path, struct index_state *istate)
{
struct strbuf strbuf = STRBUF_INIT;
if (renormalize_buffer(istate, path, mm->ptr, mm->size, &strbuf)) {
free(mm->ptr);
mm->size = strbuf.len;
mm->ptr = strbuf_detach(&strbuf, NULL);
}
}
enum ll_merge_result ll_merge(mmbuffer_t *result_buf,
const char *path,
mmfile_t *ancestor, const char *ancestor_label,
mmfile_t *ours, const char *our_label,
mmfile_t *theirs, const char *their_label,
struct index_state *istate,
const struct ll_merge_options *opts)
{
struct attr_check *check = load_merge_attributes();
static const struct ll_merge_options default_opts = LL_MERGE_OPTIONS_INIT;
const char *ll_driver_name = NULL;
int marker_size = DEFAULT_CONFLICT_MARKER_SIZE;
const struct ll_merge_driver *driver;
if (!opts)
opts = &default_opts;
if (opts->renormalize) {
normalize_file(ancestor, path, istate);
normalize_file(ours, path, istate);
normalize_file(theirs, path, istate);
}
git_check_attr(istate, path, check);
ll_driver_name = check->items[0].value;
if (check->items[1].value) {
marker_size = atoi(check->items[1].value);
if (marker_size <= 0)
marker_size = DEFAULT_CONFLICT_MARKER_SIZE;
}
driver = find_ll_merge_driver(ll_driver_name);
if (opts->virtual_ancestor) {
if (driver->recursive)
driver = find_ll_merge_driver(driver->recursive);
merge-recursive: increase marker length with depth of recursion Later patches in this series will modify file collision conflict handling (e.g. from rename/add and rename/rename(2to1) conflicts) so that multiply nested conflict markers can arise even before considering conflicts in the virtual merge base. Including the virtual merge base will provide a way to get triply (or higher) nested conflict markers. This new way to get nested conflict markers will force the need for a more general mechanism to extend the length of conflict markers in order to differentiate between different nestings. Along with this change to conflict marker length handling, we want to make sure that we don't regress handling for other types of conflicts with nested conflict markers. Add a more involved testcase using merge.conflictstyle=diff3, where not only does the virtual merge base contain conflicts, but its virtual merge base does as well (i.e. a case with triply nested conflict markers). While there are multiple reasonable ways to handle nested conflict markers in the virtual merge base for this type of situation, the easiest approach that dovetails well with the new needs for the file collision conflict handling is to require that the length of the conflict markers increase with each subsequent nesting. Subsequent patches which change the rename/add and rename/rename(2to1) conflict handling will modify the extra_marker_size flag appropriately for their new needs. Signed-off-by: Elijah Newren <newren@gmail.com> Signed-off-by: Junio C Hamano <gitster@pobox.com>
2018-11-08 07:40:24 +03:00
}
if (opts->extra_marker_size) {
marker_size += opts->extra_marker_size;
}
return driver->fn(driver, result_buf, path, ancestor, ancestor_label,
ours, our_label, theirs, their_label,
opts, marker_size);
}
int ll_merge_marker_size(struct index_state *istate, const char *path)
{
static struct attr_check *check;
int marker_size = DEFAULT_CONFLICT_MARKER_SIZE;
if (!check)
check = attr_check_initl("conflict-marker-size", NULL);
git_check_attr(istate, path, check);
if (check->items[0].value) {
marker_size = atoi(check->items[0].value);
if (marker_size <= 0)
marker_size = DEFAULT_CONFLICT_MARKER_SIZE;
}
return marker_size;
}