2005-05-21 13:39:09 +04:00
|
|
|
/*
|
|
|
|
* Copyright (C) 2005 Junio C Hamano
|
|
|
|
*/
|
|
|
|
#include "cache.h"
|
|
|
|
#include "diff.h"
|
|
|
|
#include "diffcore.h"
|
2007-10-25 22:23:26 +04:00
|
|
|
#include "hash.h"
|
2011-02-20 12:51:16 +03:00
|
|
|
#include "progress.h"
|
2005-05-21 13:39:09 +04:00
|
|
|
|
2005-05-24 12:10:48 +04:00
|
|
|
/*
 * Table of rename/copy destinations.
 *
 * locate_rename_dst() keeps the array ordered by two->path (strcmp
 * order) so that an entry can be found with a binary search.
 */
static struct diff_rename_dst {
	struct diff_filespec *two;	/* the destination file */
	struct diff_filepair *pair;	/* NULL until record_rename_pair() matches it */
} *rename_dst;
/* number of entries in use / number of entries allocated */
static int rename_dst_nr, rename_dst_alloc;
|
2005-05-21 13:39:09 +04:00
|
|
|
|
2005-05-24 12:10:48 +04:00
|
|
|
static struct diff_rename_dst *locate_rename_dst(struct diff_filespec *two,
|
|
|
|
int insert_ok)
|
2005-05-21 13:39:09 +04:00
|
|
|
{
|
2005-05-24 12:10:48 +04:00
|
|
|
int first, last;
|
|
|
|
|
|
|
|
first = 0;
|
|
|
|
last = rename_dst_nr;
|
|
|
|
while (last > first) {
|
|
|
|
int next = (last + first) >> 1;
|
|
|
|
struct diff_rename_dst *dst = &(rename_dst[next]);
|
|
|
|
int cmp = strcmp(two->path, dst->two->path);
|
|
|
|
if (!cmp)
|
|
|
|
return dst;
|
|
|
|
if (cmp < 0) {
|
|
|
|
last = next;
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
first = next+1;
|
|
|
|
}
|
|
|
|
/* not found */
|
|
|
|
if (!insert_ok)
|
|
|
|
return NULL;
|
|
|
|
/* insert to make it at "first" */
|
|
|
|
if (rename_dst_alloc <= rename_dst_nr) {
|
|
|
|
rename_dst_alloc = alloc_nr(rename_dst_alloc);
|
|
|
|
rename_dst = xrealloc(rename_dst,
|
|
|
|
rename_dst_alloc * sizeof(*rename_dst));
|
|
|
|
}
|
|
|
|
rename_dst_nr++;
|
|
|
|
if (first < rename_dst_nr)
|
|
|
|
memmove(rename_dst + first + 1, rename_dst + first,
|
|
|
|
(rename_dst_nr - first - 1) * sizeof(*rename_dst));
|
2005-09-16 03:13:43 +04:00
|
|
|
rename_dst[first].two = alloc_filespec(two->path);
|
diff: do not use null sha1 as a sentinel value
The diff code represents paths using the diff_filespec
struct. This struct has a sha1 to represent the sha1 of the
content at that path, as well as a sha1_valid member which
indicates whether its sha1 field is actually useful. If
sha1_valid is not true, then the filespec represents a
working tree file (e.g., for the no-index case, or for when
the index is not up-to-date).
The diff_filespec is only used internally, though. At the
interfaces to the diff subsystem, callers feed the sha1
directly, and we create a diff_filespec from it. It's at
that point that we look at the sha1 and decide whether it is
valid or not; callers may pass the null sha1 as a sentinel
value to indicate that it is not.
We should not typically see the null sha1 coming from any
other source (e.g., in the index itself, or from a tree).
However, a corrupt tree might have a null sha1, which would
cause "diff --patch" to accidentally diff the working tree
version of a file instead of treating it as a blob.
This patch extends the edges of the diff interface to accept
a "sha1_valid" flag whenever we accept a sha1, and to use
that flag when creating a filespec. In some cases, this
means passing the flag through several layers, making the
code change larger than would be desirable.
One alternative would be to simply die() upon seeing
corrupted trees with null sha1s. However, this fix more
directly addresses the problem (while bogus sha1s in a tree
are probably a bad thing, it is really the sentinel
confusion sending us down the wrong code path that is what
makes it devastating). And it means that git is more capable
of examining and debugging these corrupted trees. For
example, you can still "diff --raw" such a tree to find out
when the bogus entry was introduced; you just cannot do a
"--patch" diff (just as you could not with any other
corrupted tree, as we do not have any content to diff).
Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2012-07-28 19:03:01 +04:00
|
|
|
fill_filespec(rename_dst[first].two, two->sha1, two->sha1_valid, two->mode);
|
2005-05-24 12:10:48 +04:00
|
|
|
rename_dst[first].pair = NULL;
|
|
|
|
return &(rename_dst[first]);
|
2005-05-21 13:39:09 +04:00
|
|
|
}
|
|
|
|
|
2005-05-28 02:55:55 +04:00
|
|
|
/*
 * Table of rename/copy src files.
 *
 * register_rename_src() keeps the array ordered by p->one->path
 * (strcmp order) so that an entry can be found with a binary search.
 */
static struct diff_rename_src {
	struct diff_filepair *p;	/* the pair whose "one" side is the source */
	unsigned short score; /* to remember the break score */
} *rename_src;
/* number of entries in use / number of entries allocated */
static int rename_src_nr, rename_src_alloc;
|
2005-05-21 13:39:09 +04:00
|
|
|
|
2011-01-07 00:50:05 +03:00
|
|
|
static struct diff_rename_src *register_rename_src(struct diff_filepair *p)
|
2005-05-24 12:10:48 +04:00
|
|
|
{
|
|
|
|
int first, last;
|
2011-01-07 00:50:05 +03:00
|
|
|
struct diff_filespec *one = p->one;
|
|
|
|
unsigned short score = p->score;
|
2005-05-24 12:10:48 +04:00
|
|
|
|
|
|
|
first = 0;
|
|
|
|
last = rename_src_nr;
|
|
|
|
while (last > first) {
|
|
|
|
int next = (last + first) >> 1;
|
|
|
|
struct diff_rename_src *src = &(rename_src[next]);
|
2011-01-07 00:50:05 +03:00
|
|
|
int cmp = strcmp(one->path, src->p->one->path);
|
2005-05-24 12:10:48 +04:00
|
|
|
if (!cmp)
|
|
|
|
return src;
|
|
|
|
if (cmp < 0) {
|
|
|
|
last = next;
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
first = next+1;
|
|
|
|
}
|
2005-05-28 02:55:55 +04:00
|
|
|
|
2005-05-24 12:10:48 +04:00
|
|
|
/* insert to make it at "first" */
|
|
|
|
if (rename_src_alloc <= rename_src_nr) {
|
|
|
|
rename_src_alloc = alloc_nr(rename_src_alloc);
|
|
|
|
rename_src = xrealloc(rename_src,
|
|
|
|
rename_src_alloc * sizeof(*rename_src));
|
2005-05-21 13:39:09 +04:00
|
|
|
}
|
2005-05-24 12:10:48 +04:00
|
|
|
rename_src_nr++;
|
|
|
|
if (first < rename_src_nr)
|
|
|
|
memmove(rename_src + first + 1, rename_src + first,
|
|
|
|
(rename_src_nr - first - 1) * sizeof(*rename_src));
|
2011-01-07 00:50:05 +03:00
|
|
|
rename_src[first].p = p;
|
2006-04-09 07:17:46 +04:00
|
|
|
rename_src[first].score = score;
|
2005-05-24 12:10:48 +04:00
|
|
|
return &(rename_src[first]);
|
2005-05-21 13:39:09 +04:00
|
|
|
}
|
|
|
|
|
2007-06-21 15:52:11 +04:00
|
|
|
static int basename_same(struct diff_filespec *src, struct diff_filespec *dst)
|
|
|
|
{
|
|
|
|
int src_len = strlen(src->path), dst_len = strlen(dst->path);
|
|
|
|
while (src_len && dst_len) {
|
|
|
|
char c1 = src->path[--src_len];
|
|
|
|
char c2 = dst->path[--dst_len];
|
|
|
|
if (c1 != c2)
|
|
|
|
return 0;
|
|
|
|
if (c1 == '/')
|
|
|
|
return 1;
|
|
|
|
}
|
|
|
|
return (!src_len || src->path[src_len - 1] == '/') &&
|
|
|
|
(!dst_len || dst->path[dst_len - 1] == '/');
|
|
|
|
}
|
|
|
|
|
2005-05-21 13:39:09 +04:00
|
|
|
/*
 * One rename/copy candidate: a <src, dst> pair and how well it
 * matches.  score_compare() orders these with the best match first
 * and treats an entry whose dst is negative as an unused slot.
 */
struct diff_score {
	int src; /* index in rename_src */
	int dst; /* index in rename_dst */
	unsigned short score;	/* similarity; compared first by score_compare() */
	short name_score;	/* tie-breaker when two scores are equal */
};
|
|
|
|
|
|
|
|
static int estimate_similarity(struct diff_filespec *src,
|
|
|
|
struct diff_filespec *dst,
|
|
|
|
int minimum_score)
|
|
|
|
{
|
|
|
|
/* src points at a file that existed in the original tree (or
|
|
|
|
* optionally a file in the destination tree) and dst points
|
|
|
|
* at a newly created file. They may be quite similar, in which
|
|
|
|
* case we want to say src is renamed to dst or src is copied into
|
|
|
|
* dst, and then some edit has been applied to dst.
|
|
|
|
*
|
|
|
|
* Compare them and return how similar they are, representing
|
2005-05-28 02:56:38 +04:00
|
|
|
* the score as an integer between 0 and MAX_SCORE.
|
|
|
|
*
|
|
|
|
* When there is an exact match, it is considered a better
|
|
|
|
* match than anything else; the destination does not even
|
|
|
|
* call into this function in that case.
|
2005-05-21 13:39:09 +04:00
|
|
|
*/
|
2006-03-13 09:26:34 +03:00
|
|
|
unsigned long max_size, delta_size, base_size, src_copied, literal_added;
|
2005-06-29 03:58:27 +04:00
|
|
|
unsigned long delta_limit;
|
2005-05-21 13:39:09 +04:00
|
|
|
int score;
|
|
|
|
|
2005-05-23 08:24:49 +04:00
|
|
|
/* We deal only with regular files. Symlink renames are handled
|
|
|
|
* only when they are exact matches --- in other words, no edits
|
|
|
|
* after renaming.
|
|
|
|
*/
|
|
|
|
if (!S_ISREG(src->mode) || !S_ISREG(dst->mode))
|
|
|
|
return 0;
|
|
|
|
|
2007-10-27 03:51:28 +04:00
|
|
|
/*
|
|
|
|
* Need to check that source and destination sizes are
|
|
|
|
* filled in before comparing them.
|
|
|
|
*
|
|
|
|
* If we already have "cnt_data" filled in, we know it's
|
|
|
|
* all good (avoid checking the size for zero, as that
|
|
|
|
* is a possible size - we really should have a flag to
|
|
|
|
* say whether the size is valid or not!)
|
|
|
|
*/
|
2009-01-20 18:59:57 +03:00
|
|
|
if (!src->cnt_data && diff_populate_filespec(src, 1))
|
2007-10-27 03:51:28 +04:00
|
|
|
return 0;
|
2009-01-20 18:59:57 +03:00
|
|
|
if (!dst->cnt_data && diff_populate_filespec(dst, 1))
|
2007-10-27 03:51:28 +04:00
|
|
|
return 0;
|
|
|
|
|
2006-03-13 09:26:34 +03:00
|
|
|
max_size = ((src->size > dst->size) ? src->size : dst->size);
|
2005-05-22 02:55:18 +04:00
|
|
|
base_size = ((src->size < dst->size) ? src->size : dst->size);
|
2006-03-13 09:26:34 +03:00
|
|
|
delta_size = max_size - base_size;
|
2005-05-21 13:39:09 +04:00
|
|
|
|
2005-05-22 02:55:18 +04:00
|
|
|
/* We would not consider edits that change the file size so
|
|
|
|
* drastically. delta_size must be smaller than
|
2005-05-22 12:31:28 +04:00
|
|
|
* (MAX_SCORE-minimum_score)/MAX_SCORE * min(src->size, dst->size).
|
2005-05-28 02:56:38 +04:00
|
|
|
*
|
2005-05-22 02:55:18 +04:00
|
|
|
* Note that base_size == 0 case is handled here already
|
|
|
|
* and the final score computation below would not have a
|
|
|
|
* divide-by-zero issue.
|
2005-05-21 13:39:09 +04:00
|
|
|
*/
|
2011-02-19 07:12:06 +03:00
|
|
|
if (max_size * (MAX_SCORE-minimum_score) < delta_size * MAX_SCORE)
|
2005-05-21 13:39:09 +04:00
|
|
|
return 0;
|
|
|
|
|
2009-01-20 18:59:57 +03:00
|
|
|
if (!src->cnt_data && diff_populate_filespec(src, 0))
|
|
|
|
return 0;
|
|
|
|
if (!dst->cnt_data && diff_populate_filespec(dst, 0))
|
|
|
|
return 0;
|
|
|
|
|
2007-03-07 04:44:37 +03:00
|
|
|
delta_limit = (unsigned long)
|
|
|
|
(base_size * (MAX_SCORE-minimum_score) / MAX_SCORE);
|
2007-06-29 09:54:37 +04:00
|
|
|
if (diffcore_count_changes(src, dst,
|
2006-03-12 14:22:10 +03:00
|
|
|
&src->cnt_data, &dst->cnt_data,
|
2006-03-01 03:01:36 +03:00
|
|
|
delta_limit,
|
|
|
|
&src_copied, &literal_added))
|
2005-05-24 23:09:32 +04:00
|
|
|
return 0;
|
2005-06-03 12:36:03 +04:00
|
|
|
|
2006-03-03 09:11:25 +03:00
|
|
|
/* How similar are they?
|
|
|
|
* what percentage of material in dst are from source?
|
2005-05-21 13:39:09 +04:00
|
|
|
*/
|
2006-03-13 09:26:34 +03:00
|
|
|
if (!dst->size)
|
2006-03-03 09:11:25 +03:00
|
|
|
score = 0; /* should not happen */
|
2007-06-25 02:23:28 +04:00
|
|
|
else
|
2007-03-07 04:44:37 +03:00
|
|
|
score = (int)(src_copied * MAX_SCORE / max_size);
|
2005-05-21 13:39:09 +04:00
|
|
|
return score;
|
|
|
|
}
|
|
|
|
|
2005-09-16 03:13:43 +04:00
|
|
|
static void record_rename_pair(int dst_index, int src_index, int score)
|
2005-05-21 13:39:09 +04:00
|
|
|
{
|
2007-10-25 22:19:10 +04:00
|
|
|
struct diff_filespec *src, *dst;
|
2005-05-24 12:10:48 +04:00
|
|
|
struct diff_filepair *dp;
|
[PATCH] Rename/copy detection fix.
The rename/copy detection logic in earlier round was only good
enough to show patch output and discussion on the mailing list
about the diff-raw format updates revealed many problems with
it. This patch fixes all the ones known to me, without making
things I want to do later impossible, mostly related to patch
reordering.
(1) Earlier rename/copy detector determined which one is rename
and which one is copy too early, which made it impossible
to later introduce diffcore transformers to reorder
patches. This patch fixes it by moving that logic to the
very end of the processing.
(2) Earlier output routine diff_flush() was pruning all the
"no-change" entries indiscriminatingly. This was done due
to my false assumption that one of the requirements in the
diff-raw output was not to show such an entry (which
resulted in my incorrect comment about "diff-helper never
being able to be equivalent to built-in diff driver"). My
special thanks go to Linus for correcting me about this.
When we produce diff-raw output, for the downstream to be
able to tell renames from copies, sometimes it _is_
necessary to output "no-change" entries, and this patch
adds diffcore_prune() function for doing it.
(3) Earlier diff_filepair structure was trying to be not too
specific about rename/copy operations, but the purpose of
the structure was to record one or two paths, which _was_
indeed about rename/copy. This patch discards xfrm_msg
field which was trying to be generic for this wrong reason,
and introduces a couple of fields (rename_score and
rename_rank) that are explicitly specific to rename/copy
logic. One thing to note is that the information in a
single diff_filepair structure _still_ does not distinguish
renames from copies, and it is deliberately so. This is to
allow patches to be reordered in later stages.
(4) This patch also adds some tests about diff-raw format
output and makes sure that necessary "no-change" entries
appear on the output.
Signed-off-by: Junio C Hamano <junkio@cox.net>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-05-23 08:26:09 +04:00
|
|
|
|
2005-05-24 12:10:48 +04:00
|
|
|
if (rename_dst[dst_index].pair)
|
|
|
|
die("internal error: dst already matched.");
|
2005-05-21 13:39:09 +04:00
|
|
|
|
2011-01-07 00:50:05 +03:00
|
|
|
src = rename_src[src_index].p->one;
|
2007-10-25 22:20:56 +04:00
|
|
|
src->rename_used++;
|
2007-10-25 22:19:10 +04:00
|
|
|
src->count++;
|
2005-05-21 13:39:09 +04:00
|
|
|
|
2005-05-24 12:10:48 +04:00
|
|
|
dst = rename_dst[dst_index].two;
|
2007-10-25 22:19:10 +04:00
|
|
|
dst->count++;
|
2005-05-21 13:39:09 +04:00
|
|
|
|
2007-10-25 22:19:10 +04:00
|
|
|
dp = diff_queue(NULL, src, dst);
|
2006-08-03 23:01:01 +04:00
|
|
|
dp->renamed_pair = 1;
|
2006-04-09 07:17:46 +04:00
|
|
|
if (!strcmp(src->path, dst->path))
|
|
|
|
dp->score = rename_src[src_index].score;
|
|
|
|
else
|
|
|
|
dp->score = score;
|
2005-05-24 12:10:48 +04:00
|
|
|
rename_dst[dst_index].pair = dp;
|
2005-05-21 13:39:09 +04:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* We sort the rename similarity matrix with the score, in descending
|
2005-05-28 02:55:55 +04:00
|
|
|
* order (the most similar first).
|
2005-05-21 13:39:09 +04:00
|
|
|
*/
|
|
|
|
static int score_compare(const void *a_, const void *b_)
|
|
|
|
{
|
|
|
|
const struct diff_score *a = a_, *b = b_;
|
2007-06-25 02:23:28 +04:00
|
|
|
|
Optimize rename detection for a huge diff
When there are N deleted paths and M created paths, we used to
allocate (N x M) "struct diff_score" that record how similar
each of the pair is, and picked the <src,dst> pair that gives
the best match first, and then went on to process worse matches.
This sorting is done so that when two new files in the postimage
that are similar to the same file deleted from the preimage, we
can process the more similar one first, and when processing the
second one, it can notice "Ah, the source I was planning to say
I am a copy of is already taken by somebody else" and continue
on to match itself with another file in the preimage with a
lessor match. This matters to a change introduced between
1.5.3.X series and 1.5.4-rc, that lets the code to favor unused
matches first and then falls back to using already used
matches.
This instead allocates and keeps only a handful rename source
candidates per new files in the postimage. I.e. it makes the
memory requirement from O(N x M) to O(M).
For each dst, we compute similarlity with all sources (i.e. the
number of similarity estimate computations is still O(N x M)),
but we keep handful best src candidates for each dst.
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2008-01-30 07:54:56 +03:00
|
|
|
/* sink the unused ones to the bottom */
|
|
|
|
if (a->dst < 0)
|
|
|
|
return (0 <= b->dst);
|
|
|
|
else if (b->dst < 0)
|
|
|
|
return -1;
|
|
|
|
|
2007-06-25 02:23:28 +04:00
|
|
|
if (a->score == b->score)
|
|
|
|
return b->name_score - a->name_score;
|
|
|
|
|
2005-05-21 13:39:09 +04:00
|
|
|
return b->score - a->score;
|
|
|
|
}
|
|
|
|
|
2007-10-25 22:23:26 +04:00
|
|
|
/*
 * A node in a singly linked list of files; find_identical_files()
 * walks such lists through "next" and reads "filespec".
 */
struct file_similarity {
	/*
	 * NOTE(review): presumably src_dst tells a source entry from a
	 * destination entry and index is the position in rename_src or
	 * rename_dst; confirm against the code that fills these in.
	 */
	int src_dst, index;
	struct diff_filespec *filespec;	/* the file this node describes */
	struct file_similarity *next;	/* next node, or NULL at the end */
};
|
|
|
|
|
|
|
|
static int find_identical_files(struct file_similarity *src,
|
2011-02-19 06:55:19 +03:00
|
|
|
struct file_similarity *dst,
|
|
|
|
struct diff_options *options)
|
2007-10-25 22:23:26 +04:00
|
|
|
{
|
|
|
|
int renames = 0;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Walk over all the destinations ...
|
|
|
|
*/
|
|
|
|
do {
|
Fix a pathological case in git detecting proper renames
Kumar Gala had a case in the u-boot archive with multiple renames of files
with identical contents, and git would turn those into multiple "copy"
operations of one of the sources, and just deleting the other sources.
This patch makes the git exact rename detection prefer to spread out the
renames over the multiple sources, rather than do multiple copies of one
source.
NOTE! The changes are a bit larger than required, because I also renamed
the variables named "one" and "two" to "target" and "source" respectively.
That makes the logic easier to follow, especially as the "one" was
illogically the target and not the source, for purely historical reasons
(this piece of code used to traverse over sources and targets in the wrong
order, and when we fixed that, we didn't fix the names back then. So I
fixed them now).
The important part of this change is just the trivial score calculations
for when files have identical contents:
/* Give higher scores to sources that haven't been used already */
score = !source->rename_used;
score += basename_same(source, target);
and when we have multiple choices we'll now pick the choice that gets the
best rename score, rather than only looking at whether the basename
matched.
It's worth noting a few gotchas:
- this scoring is currently only done for the "exact match" case.
In particular, in Kumar's example, even after this patch, the inexact
match case is still done as a copy+delete rather than as two renames:
delete mode 100644 board/cds/mpc8555cds/u-boot.lds
copy board/{cds => freescale}/mpc8541cds/u-boot.lds (97%)
rename board/{cds/mpc8541cds => freescale/mpc8555cds}/u-boot.lds (97%)
because apparently the "cds/mpc8541cds/u-boot.lds" copy looked
a bit more similar to both end results. That said, I *suspect* we just
have the exact same issue there - the similarity analysis just gave
identical (or at least very _close_ to identical) similarity points,
and we do not have any logic to prefer multiple renames over a
copy/delete there.
That is a separate patch.
- When you have identical contents and identical basenames, the actual
entry that is chosen is still picked fairly "at random" for the first
one (but the subsequent ones will prefer entries that haven't already
been used).
It's not actually really random, in that it actually depends on the
relative alphabetical order of the files (which in turn will have
impacted the order that the entries got hashed!), so it gives
consistent results that can be explained. But I wanted to point it out
as an issue for when anybody actually does cross-renames.
In Kumar's case the choice is the right one (and for a single normal
directory rename it should always be, since the relative alphabetical
sorting of the files will be identical), and we now get:
rename board/{cds => freescale}/mpc8541cds/init.S (100%)
rename board/{cds => freescale}/mpc8548cds/init.S (100%)
which is the "expected" answer. However, it might still be better to
change the pedantic "exact same basename" on/off choice into a more
graduated "how similar are the pathnames" scoring situation, in order
to be more likely to get the exact rename choice that people *expect*
to see, rather than other alternatives that may *technically* be
equally good, but are surprising to a human.
It's also unclear whether we should consider "basenames are equal" or
"have already used this as a source" to be more important. This gives them
equal weight, but I suspect we might want to just multiply the "basenames
are equal" weight by two, or something, to prefer equal basenames even if
that causes a copy/delete pair. I dunno.
Anyway, what I'm just saying in a really long-winded manner is that I
think this is right as-is, but it's not the complete solution, and it may
want some further tweaking in the future.
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2007-11-30 00:30:13 +03:00
|
|
|
struct diff_filespec *target = dst->filespec;
|
2007-10-25 22:23:26 +04:00
|
|
|
struct file_similarity *p, *best;
|
Fix a pathological case in git detecting proper renames
Kumar Gala had a case in the u-boot archive with multiple renames of files
with identical contents, and git would turn those into multiple "copy"
operations of one of the sources, and just deleting the other sources.
This patch makes the git exact rename detection prefer to spread out the
renames over the multiple sources, rather than do multiple copies of one
source.
NOTE! The changes are a bit larger than required, because I also renamed
the variables named "one" and "two" to "target" and "source" respectively.
That makes the logic easier to follow, especially as the "one" was
illogically the target and not the source, for purely historical reasons
(this piece of code used to traverse over sources and targets in the wrong
order, and when we fixed that, we didn't fix the names back then. So I
fixed them now).
The important part of this change is just the trivial score calculations
for when files have identical contents:
/* Give higher scores to sources that haven't been used already */
score = !source->rename_used;
score += basename_same(source, target);
and when we have multiple choices we'll now pick the choice that gets the
best rename score, rather than only looking at whether the basename
matched.
It's worth noting a few gotchas:
- this scoring is currently only done for the "exact match" case.
In particular, in Kumar's example, even after this patch, the inexact
match case is still done as a copy+delete rather than as two renames:
delete mode 100644 board/cds/mpc8555cds/u-boot.lds
copy board/{cds => freescale}/mpc8541cds/u-boot.lds (97%)
rename board/{cds/mpc8541cds => freescale/mpc8555cds}/u-boot.lds (97%)
because apparently the "cds/mpc8541cds/u-boot.lds" copy looked
a bit more similar to both end results. That said, I *suspect* we just
have the exact same issue there - the similarity analysis just gave
identical (or at least very _close_ to identical) similarity points,
and we do not have any logic to prefer multiple renames over a
copy/delete there.
That is a separate patch.
- When you have identical contents and identical basenames, the actual
entry that is chosen is still picked fairly "at random" for the first
one (but the subsequent ones will prefer entries that haven't already
been used).
It's not actually really random, in that it actually depends on the
relative alphabetical order of the files (which in turn will have
impacted the order that the entries got hashed!), so it gives
consistent results that can be explained. But I wanted to point it out
as an issue for when anybody actually does cross-renames.
In Kumar's case the choice is the right one (and for a single normal
directory rename it should always be, since the relative alphabetical
sorting of the files will be identical), and we now get:
rename board/{cds => freescale}/mpc8541cds/init.S (100%)
rename board/{cds => freescale}/mpc8548cds/init.S (100%)
which is the "expected" answer. However, it might still be better to
change the pedantic "exact same basename" on/off choice into a more
graduated "how similar are the pathnames" scoring situation, in order
to be more likely to get the exact rename choice that people *expect*
to see, rather than other alternatives that may *technically* be
equally good, but are surprising to a human.
It's also unclear whether we should consider "basenames are equal" or
"have already used this as a source" to be more important. This gives them
equal weight, but I suspect we might want to just multiply the "basenames
are equal" weight by two, or something, to prefer equal basenames even if
that causes a copy/delete pair. I dunno.
Anyway, what I'm just saying in a really long-winded manner is that I
think this is right as-is, but it's not the complete solution, and it may
want some further tweaking in the future.
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2007-11-30 00:30:13 +03:00
|
|
|
int i = 100, best_score = -1;
|
2007-10-25 22:23:26 +04:00
|
|
|
|
|
|
|
/*
|
|
|
|
* .. to find the best source match
|
|
|
|
*/
|
|
|
|
best = NULL;
|
|
|
|
for (p = src; p; p = p->next) {
|
Fix a pathological case in git detecting proper renames
Kumar Gala had a case in the u-boot archive with multiple renames of files
with identical contents, and git would turn those into multiple "copy"
operations of one of the sources, and just deleting the other sources.
This patch makes the git exact rename detection prefer to spread out the
renames over the multiple sources, rather than do multiple copies of one
source.
NOTE! The changes are a bit larger than required, because I also renamed
the variables named "one" and "two" to "target" and "source" respectively.
That makes the logic easier to follow, especially as the "one" was
illogically the target and not the source, for purely historical reasons
(this piece of code used to traverse over sources and targets in the wrong
order, and when we fixed that, we didn't fix the names back then. So I
fixed them now).
The important part of this change is just the trivial score calculations
for when files have identical contents:
/* Give higher scores to sources that haven't been used already */
score = !source->rename_used;
score += basename_same(source, target);
and when we have multiple choices we'll now pick the choice that gets the
best rename score, rather than only looking at whether the basename
matched.
It's worth noting a few gotchas:
- this scoring is currently only done for the "exact match" case.
In particular, in Kumar's example, even after this patch, the inexact
match case is still done as a copy+delete rather than as two renames:
delete mode 100644 board/cds/mpc8555cds/u-boot.lds
copy board/{cds => freescale}/mpc8541cds/u-boot.lds (97%)
rename board/{cds/mpc8541cds => freescale/mpc8555cds}/u-boot.lds (97%)
because apparently the "cds/mpc8541cds/u-boot.lds" copy looked
a bit more similar to both end results. That said, I *suspect* we just
have the exact same issue there - the similarity analysis just gave
identical (or at least very _close_ to identical) similarity points,
and we do not have any logic to prefer multiple renames over a
copy/delete there.
That is a separate patch.
- When you have identical contents and identical basenames, the actual
entry that is chosen is still picked fairly "at random" for the first
one (but the subsequent ones will prefer entries that haven't already
been used).
It's not actually really random, in that it actually depends on the
relative alphabetical order of the files (which in turn will have
impacted the order that the entries got hashed!), so it gives
consistent results that can be explained. But I wanted to point it out
as an issue for when anybody actually does cross-renames.
In Kumar's case the choice is the right one (and for a single normal
directory rename it should always be, since the relative alphabetical
sorting of the files will be identical), and we now get:
rename board/{cds => freescale}/mpc8541cds/init.S (100%)
rename board/{cds => freescale}/mpc8548cds/init.S (100%)
which is the "expected" answer. However, it might still be better to
change the pedantic "exact same basename" on/off choice into a more
graduated "how similar are the pathnames" scoring situation, in order
to be more likely to get the exact rename choice that people *expect*
to see, rather than other alternatives that may *technically* be
equally good, but are surprising to a human.
It's also unclear whether we should consider "basenames are equal" or
"have already used this as a source" to be more important. This gives them
equal weight, but I suspect we might want to just multiply the "basenames
are equal" weight by two, or something, to prefer equal basenames even if
that causes a copy/delete pair. I dunno.
Anyway, what I'm just saying in a really long-winded manner is that I
think this is right as-is, but it's not the complete solution, and it may
want some further tweaking in the future.
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2007-11-30 00:30:13 +03:00
|
|
|
int score;
|
|
|
|
struct diff_filespec *source = p->filespec;
|
2007-10-25 22:23:26 +04:00
|
|
|
|
2009-04-17 22:13:30 +04:00
|
|
|
/* False hash collision? */
|
Fix a pathological case in git detecting proper renames
Kumar Gala had a case in the u-boot archive with multiple renames of files
with identical contents, and git would turn those into multiple "copy"
operations of one of the sources, and just deleting the other sources.
This patch makes the git exact rename detection prefer to spread out the
renames over the multiple sources, rather than do multiple copies of one
source.
NOTE! The changes are a bit larger than required, because I also renamed
the variables named "one" and "two" to "target" and "source" respectively.
That makes the logic easier to follow, especially as the "one" was
illogically the target and not the source, for purely historical reasons
(this piece of code used to traverse over sources and targets in the wrong
order, and when we fixed that, we didn't fix the names back then. So I
fixed them now).
The important part of this change is just the trivial score calculations
for when files have identical contents:
/* Give higher scores to sources that haven't been used already */
score = !source->rename_used;
score += basename_same(source, target);
and when we have multiple choices we'll now pick the choice that gets the
best rename score, rather than only looking at whether the basename
matched.
It's worth noting a few gotchas:
- this scoring is currently only done for the "exact match" case.
In particular, in Kumar's example, even after this patch, the inexact
match case is still done as a copy+delete rather than as two renames:
delete mode 100644 board/cds/mpc8555cds/u-boot.lds
copy board/{cds => freescale}/mpc8541cds/u-boot.lds (97%)
rename board/{cds/mpc8541cds => freescale/mpc8555cds}/u-boot.lds (97%)
because apparently the "cds/mpc8541cds/u-boot.lds" copy looked
a bit more similar to both end results. That said, I *suspect* we just
have the exact same issue there - the similarity analysis just gave
identical (or at least very _close_ to identical) similarity points,
and we do not have any logic to prefer multiple renames over a
copy/delete there.
That is a separate patch.
- When you have identical contents and identical basenames, the actual
entry that is chosen is still picked fairly "at random" for the first
one (but the subsequent ones will prefer entries that haven't already
been used).
It's not actually really random, in that it actually depends on the
relative alphabetical order of the files (which in turn will have
impacted the order that the entries got hashed!), so it gives
consistent results that can be explained. But I wanted to point it out
as an issue for when anybody actually does cross-renames.
In Kumar's case the choice is the right one (and for a single normal
directory rename it should always be, since the relative alphabetical
sorting of the files will be identical), and we now get:
rename board/{cds => freescale}/mpc8541cds/init.S (100%)
rename board/{cds => freescale}/mpc8548cds/init.S (100%)
which is the "expected" answer. However, it might still be better to
change the pedantic "exact same basename" on/off choice into a more
graduated "how similar are the pathnames" scoring situation, in order
to be more likely to get the exact rename choice that people *expect*
to see, rather than other alternatives that may *technically* be
equally good, but are surprising to a human.
It's also unclear whether we should consider "basenames are equal" or
"have already used this as a source" to be more important. This gives them
equal weight, but I suspect we might want to just multiply the "basenames
are equal" weight by two, or something, to prefer equal basenames even if
that causes a copy/delete pair. I dunno.
Anyway, what I'm just saying in a really long-winded manner is that I
think this is right as-is, but it's not the complete solution, and it may
want some further tweaking in the future.
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2007-11-30 00:30:13 +03:00
|
|
|
if (hashcmp(source->sha1, target->sha1))
|
2007-10-25 22:23:26 +04:00
|
|
|
continue;
|
|
|
|
/* Non-regular files? If so, the modes must match! */
|
Fix a pathological case in git detecting proper renames
Kumar Gala had a case in the u-boot archive with multiple renames of files
with identical contents, and git would turn those into multiple "copy"
operations of one of the sources, and just deleting the other sources.
This patch makes the git exact rename detection prefer to spread out the
renames over the multiple sources, rather than do multiple copies of one
source.
NOTE! The changes are a bit larger than required, because I also renamed
the variables named "one" and "two" to "target" and "source" respectively.
That makes the logic easier to follow, especially as the "one" was
illogically the target and not the source, for purely historical reasons
(this piece of code used to traverse over sources and targets in the wrong
order, and when we fixed that, we didn't fix the names back then. So I
fixed them now).
The important part of this change is just the trivial score calculations
for when files have identical contents:
/* Give higher scores to sources that haven't been used already */
score = !source->rename_used;
score += basename_same(source, target);
and when we have multiple choices we'll now pick the choice that gets the
best rename score, rather than only looking at whether the basename
matched.
It's worth noting a few gotchas:
- this scoring is currently only done for the "exact match" case.
In particular, in Kumar's example, even after this patch, the inexact
match case is still done as a copy+delete rather than as two renames:
delete mode 100644 board/cds/mpc8555cds/u-boot.lds
copy board/{cds => freescale}/mpc8541cds/u-boot.lds (97%)
rename board/{cds/mpc8541cds => freescale/mpc8555cds}/u-boot.lds (97%)
because apparently the "cds/mpc8541cds/u-boot.lds" copy looked
a bit more similar to both end results. That said, I *suspect* we just
have the exact same issue there - the similarity analysis just gave
identical (or at least very _close_ to identical) similarity points,
and we do not have any logic to prefer multiple renames over a
copy/delete there.
That is a separate patch.
- When you have identical contents and identical basenames, the actual
entry that is chosen is still picked fairly "at random" for the first
one (but the subsequent ones will prefer entries that haven't already
been used).
It's not actually really random, in that it actually depends on the
relative alphabetical order of the files (which in turn will have
impacted the order that the entries got hashed!), so it gives
consistent results that can be explained. But I wanted to point it out
as an issue for when anybody actually does cross-renames.
In Kumar's case the choice is the right one (and for a single normal
directory rename it should always be, since the relative alphabetical
sorting of the files will be identical), and we now get:
rename board/{cds => freescale}/mpc8541cds/init.S (100%)
rename board/{cds => freescale}/mpc8548cds/init.S (100%)
which is the "expected" answer. However, it might still be better to
change the pedantic "exact same basename" on/off choice into a more
graduated "how similar are the pathnames" scoring situation, in order
to be more likely to get the exact rename choice that people *expect*
to see, rather than other alternatives that may *technically* be
equally good, but are surprising to a human.
It's also unclear whether we should consider "basenames are equal" or
"have already used this as a source" to be more important. This gives them
equal weight, but I suspect we might want to just multiply the "basenames
are equal" weight by two, or something, to prefer equal basenames even if
that causes a copy/delete pair. I dunno.
Anyway, what I'm just saying in a really long-winded manner is that I
think this is right as-is, but it's not the complete solution, and it may
want some further tweaking in the future.
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2007-11-30 00:30:13 +03:00
|
|
|
if (!S_ISREG(source->mode) || !S_ISREG(target->mode)) {
|
|
|
|
if (source->mode != target->mode)
|
2007-10-25 22:23:26 +04:00
|
|
|
continue;
|
|
|
|
}
|
Fix a pathological case in git detecting proper renames
Kumar Gala had a case in the u-boot archive with multiple renames of files
with identical contents, and git would turn those into multiple "copy"
operations of one of the sources, and just deleting the other sources.
This patch makes the git exact rename detection prefer to spread out the
renames over the multiple sources, rather than do multiple copies of one
source.
NOTE! The changes are a bit larger than required, because I also renamed
the variables named "one" and "two" to "target" and "source" respectively.
That makes the logic easier to follow, especially as the "one" was
illogically the target and not the source, for purely historical reasons
(this piece of code used to traverse over sources and targets in the wrong
order, and when we fixed that, we didn't fix the names back then. So I
fixed them now).
The important part of this change is just the trivial score calculations
for when files have identical contents:
/* Give higher scores to sources that haven't been used already */
score = !source->rename_used;
score += basename_same(source, target);
and when we have multiple choices we'll now pick the choice that gets the
best rename score, rather than only looking at whether the basename
matched.
It's worth noting a few gotchas:
- this scoring is currently only done for the "exact match" case.
In particular, in Kumar's example, even after this patch, the inexact
match case is still done as a copy+delete rather than as two renames:
delete mode 100644 board/cds/mpc8555cds/u-boot.lds
copy board/{cds => freescale}/mpc8541cds/u-boot.lds (97%)
rename board/{cds/mpc8541cds => freescale/mpc8555cds}/u-boot.lds (97%)
because apparently the "cds/mpc8541cds/u-boot.lds" copy looked
a bit more similar to both end results. That said, I *suspect* we just
have the exact same issue there - the similarity analysis just gave
identical (or at least very _close_ to identical) similarity points,
and we do not have any logic to prefer multiple renames over a
copy/delete there.
That is a separate patch.
- When you have identical contents and identical basenames, the actual
entry that is chosen is still picked fairly "at random" for the first
one (but the subsequent ones will prefer entries that haven't already
been used).
It's not actually really random, in that it actually depends on the
relative alphabetical order of the files (which in turn will have
impacted the order that the entries got hashed!), so it gives
consistent results that can be explained. But I wanted to point it out
as an issue for when anybody actually does cross-renames.
In Kumar's case the choice is the right one (and for a single normal
directory rename it should always be, since the relative alphabetical
sorting of the files will be identical), and we now get:
rename board/{cds => freescale}/mpc8541cds/init.S (100%)
rename board/{cds => freescale}/mpc8548cds/init.S (100%)
which is the "expected" answer. However, it might still be better to
change the pedantic "exact same basename" on/off choice into a more
graduated "how similar are the pathnames" scoring situation, in order
to be more likely to get the exact rename choice that people *expect*
to see, rather than other alternatives that may *technically* be
equally good, but are surprising to a human.
It's also unclear whether we should consider "basenames are equal" or
"have already used this as a source" to be more important. This gives them
equal weight, but I suspect we might want to just multiply the "basenames
are equal" weight by two, or something, to prefer equal basenames even if
that causes a copy/delete pair. I dunno.
Anyway, what I'm just saying in a really long-winded manner is that I
think this is right as-is, but it's not the complete solution, and it may
want some further tweaking in the future.
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2007-11-30 00:30:13 +03:00
|
|
|
/* Give higher scores to sources that haven't been used already */
|
|
|
|
score = !source->rename_used;
|
2011-02-19 07:10:32 +03:00
|
|
|
if (source->rename_used && options->detect_rename != DIFF_DETECT_COPY)
|
|
|
|
continue;
|
Fix a pathological case in git detecting proper renames
Kumar Gala had a case in the u-boot archive with multiple renames of files
with identical contents, and git would turn those into multiple "copy"
operations of one of the sources, and just deleting the other sources.
This patch makes the git exact rename detection prefer to spread out the
renames over the multiple sources, rather than do multiple copies of one
source.
NOTE! The changes are a bit larger than required, because I also renamed
the variables named "one" and "two" to "target" and "source" respectively.
That makes the logic easier to follow, especially as the "one" was
illogically the target and not the source, for purely historical reasons
(this piece of code used to traverse over sources and targets in the wrong
order, and when we fixed that, we didn't fix the names back then. So I
fixed them now).
The important part of this change is just the trivial score calculations
for when files have identical contents:
/* Give higher scores to sources that haven't been used already */
score = !source->rename_used;
score += basename_same(source, target);
and when we have multiple choices we'll now pick the choice that gets the
best rename score, rather than only looking at whether the basename
matched.
It's worth noting a few gotchas:
- this scoring is currently only done for the "exact match" case.
In particular, in Kumar's example, even after this patch, the inexact
match case is still done as a copy+delete rather than as two renames:
delete mode 100644 board/cds/mpc8555cds/u-boot.lds
copy board/{cds => freescale}/mpc8541cds/u-boot.lds (97%)
rename board/{cds/mpc8541cds => freescale/mpc8555cds}/u-boot.lds (97%)
because apparently the "cds/mpc8541cds/u-boot.lds" copy looked
a bit more similar to both end results. That said, I *suspect* we just
have the exact same issue there - the similarity analysis just gave
identical (or at least very _close_ to identical) similarity points,
and we do not have any logic to prefer multiple renames over a
copy/delete there.
That is a separate patch.
- When you have identical contents and identical basenames, the actual
entry that is chosen is still picked fairly "at random" for the first
one (but the subsequent ones will prefer entries that haven't already
been used).
It's not actually really random, in that it actually depends on the
relative alphabetical order of the files (which in turn will have
impacted the order that the entries got hashed!), so it gives
consistent results that can be explained. But I wanted to point it out
as an issue for when anybody actually does cross-renames.
In Kumar's case the choice is the right one (and for a single normal
directory rename it should always be, since the relative alphabetical
sorting of the files will be identical), and we now get:
rename board/{cds => freescale}/mpc8541cds/init.S (100%)
rename board/{cds => freescale}/mpc8548cds/init.S (100%)
which is the "expected" answer. However, it might still be better to
change the pedantic "exact same basename" on/off choice into a more
graduated "how similar are the pathnames" scoring situation, in order
to be more likely to get the exact rename choice that people *expect*
to see, rather than other alternatives that may *technically* be
equally good, but are surprising to a human.
It's also unclear whether we should consider "basenames are equal" or
"have already used this as a source" to be more important. This gives them
equal weight, but I suspect we might want to just multiply the "basenames
are equal" weight by two, or something, to prefer equal basenames even if
that causes a copy/delete pair. I dunno.
Anyway, what I'm just saying in a really long-winded manner is that I
think this is right as-is, but it's not the complete solution, and it may
want some further tweaking in the future.
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2007-11-30 00:30:13 +03:00
|
|
|
score += basename_same(source, target);
|
|
|
|
if (score > best_score) {
|
|
|
|
best = p;
|
|
|
|
best_score = score;
|
|
|
|
if (score == 2)
|
|
|
|
break;
|
|
|
|
}
|
2007-10-25 22:23:26 +04:00
|
|
|
|
|
|
|
/* Too many identical alternatives? Pick one */
|
|
|
|
if (!--i)
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
if (best) {
|
|
|
|
record_rename_pair(dst->index, best->index, MAX_SCORE);
|
|
|
|
renames++;
|
|
|
|
}
|
|
|
|
} while ((dst = dst->next) != NULL);
|
|
|
|
return renames;
|
|
|
|
}
|
|
|
|
|
|
|
|
static void free_similarity_list(struct file_similarity *p)
|
|
|
|
{
|
|
|
|
while (p) {
|
|
|
|
struct file_similarity *entry = p;
|
|
|
|
p = p->next;
|
|
|
|
free(entry);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2011-02-19 06:55:19 +03:00
|
|
|
static int find_same_files(void *ptr, void *data)
|
2007-10-25 22:23:26 +04:00
|
|
|
{
|
|
|
|
int ret;
|
|
|
|
struct file_similarity *p = ptr;
|
|
|
|
struct file_similarity *src = NULL, *dst = NULL;
|
2011-02-19 06:55:19 +03:00
|
|
|
struct diff_options *options = data;
|
2007-10-25 22:23:26 +04:00
|
|
|
|
|
|
|
/* Split the hash list up into sources and destinations */
|
|
|
|
do {
|
|
|
|
struct file_similarity *entry = p;
|
|
|
|
p = p->next;
|
|
|
|
if (entry->src_dst < 0) {
|
|
|
|
entry->next = src;
|
|
|
|
src = entry;
|
|
|
|
} else {
|
|
|
|
entry->next = dst;
|
|
|
|
dst = entry;
|
|
|
|
}
|
|
|
|
} while (p);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* If we have both sources *and* destinations, see if
|
|
|
|
* we can match them up
|
|
|
|
*/
|
2011-02-19 06:55:19 +03:00
|
|
|
ret = (src && dst) ? find_identical_files(src, dst, options) : 0;
|
2007-10-25 22:23:26 +04:00
|
|
|
|
|
|
|
/* Free the hashes and return the number of renames found */
|
|
|
|
free_similarity_list(src);
|
|
|
|
free_similarity_list(dst);
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
static unsigned int hash_filespec(struct diff_filespec *filespec)
|
|
|
|
{
|
|
|
|
unsigned int hash;
|
|
|
|
if (!filespec->sha1_valid) {
|
|
|
|
if (diff_populate_filespec(filespec, 0))
|
|
|
|
return 0;
|
|
|
|
hash_sha1_file(filespec->data, filespec->size, "blob", filespec->sha1);
|
|
|
|
}
|
|
|
|
memcpy(&hash, filespec->sha1, sizeof(hash));
|
|
|
|
return hash;
|
|
|
|
}
|
|
|
|
|
|
|
|
static void insert_file_table(struct hash_table *table, int src_dst, int index, struct diff_filespec *filespec)
|
|
|
|
{
|
|
|
|
void **pos;
|
|
|
|
unsigned int hash;
|
|
|
|
struct file_similarity *entry = xmalloc(sizeof(*entry));
|
|
|
|
|
|
|
|
entry->src_dst = src_dst;
|
|
|
|
entry->index = index;
|
|
|
|
entry->filespec = filespec;
|
|
|
|
entry->next = NULL;
|
|
|
|
|
|
|
|
hash = hash_filespec(filespec);
|
|
|
|
pos = insert_hash(hash, entry, table);
|
|
|
|
|
|
|
|
/* We already had an entry there? */
|
|
|
|
if (pos) {
|
|
|
|
entry->next = *pos;
|
|
|
|
*pos = entry;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2007-10-25 22:17:55 +04:00
|
|
|
/*
|
|
|
|
* Find exact renames first.
|
|
|
|
*
|
|
|
|
* The first round matches up the up-to-date entries,
|
|
|
|
* and then during the second round we try to match
|
|
|
|
* cache-dirty entries as well.
|
|
|
|
*/
|
2011-02-19 06:55:19 +03:00
|
|
|
static int find_exact_renames(struct diff_options *options)
|
2007-10-25 22:17:55 +04:00
|
|
|
{
|
2007-10-25 22:23:26 +04:00
|
|
|
int i;
|
|
|
|
struct hash_table file_table;
|
2007-10-25 22:17:55 +04:00
|
|
|
|
2007-10-25 22:23:26 +04:00
|
|
|
init_hash(&file_table);
|
|
|
|
for (i = 0; i < rename_src_nr; i++)
|
2011-01-07 00:50:05 +03:00
|
|
|
insert_file_table(&file_table, -1, i, rename_src[i].p->one);
|
2007-10-25 22:23:26 +04:00
|
|
|
|
|
|
|
for (i = 0; i < rename_dst_nr; i++)
|
|
|
|
insert_file_table(&file_table, 1, i, rename_dst[i].two);
|
|
|
|
|
|
|
|
/* Find the renames */
|
2011-02-19 06:55:19 +03:00
|
|
|
i = for_each_hash(&file_table, find_same_files, options);
|
2007-10-25 22:23:26 +04:00
|
|
|
|
|
|
|
/* .. and free the hash data structure */
|
|
|
|
free_hash(&file_table);
|
|
|
|
|
|
|
|
return i;
|
2007-10-25 22:17:55 +04:00
|
|
|
}
|
|
|
|
|
Optimize rename detection for a huge diff
When there are N deleted paths and M created paths, we used to
allocate (N x M) "struct diff_score" that record how similar
each of the pair is, and picked the <src,dst> pair that gives
the best match first, and then went on to process worse matches.
This sorting is done so that when two new files in the postimage
that are similar to the same file deleted from the preimage, we
can process the more similar one first, and when processing the
second one, it can notice "Ah, the source I was planning to say
I am a copy of is already taken by somebody else" and continue
on to match itself with another file in the preimage with a
lesser match. This matters to a change introduced between
1.5.3.X series and 1.5.4-rc, that lets the code to favor unused
matches first and then falls back to using already used
matches.
This instead allocates and keeps only a handful of rename source
candidates per new file in the postimage, i.e. it reduces the
memory requirement from O(N x M) to O(M).
For each dst, we compute similarity with all sources (i.e. the
number of similarity estimate computations is still O(N x M)),
but we keep only a handful of the best src candidates for each dst.
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2008-01-30 07:54:56 +03:00
|
|
|
#define NUM_CANDIDATE_PER_DST 4
|
|
|
|
static void record_if_better(struct diff_score m[], struct diff_score *o)
|
|
|
|
{
|
|
|
|
int i, worst;
|
|
|
|
|
|
|
|
/* find the worst one */
|
|
|
|
worst = 0;
|
|
|
|
for (i = 1; i < NUM_CANDIDATE_PER_DST; i++)
|
|
|
|
if (score_compare(&m[i], &m[worst]) > 0)
|
|
|
|
worst = i;
|
|
|
|
|
|
|
|
/* is it better than the worst one? */
|
|
|
|
if (score_compare(&m[worst], o) > 0)
|
|
|
|
m[worst] = *o;
|
|
|
|
}
|
|
|
|
|
2011-01-07 00:50:06 +03:00
|
|
|
/*
|
|
|
|
* Returns:
|
|
|
|
* 0 if we are under the limit;
|
|
|
|
* 1 if we need to disable inexact rename detection;
|
|
|
|
* 2 if we would be under the limit if we were given -C instead of -C -C.
|
|
|
|
*/
|
2011-01-07 00:50:04 +03:00
|
|
|
static int too_many_rename_candidates(int num_create,
|
|
|
|
struct diff_options *options)
|
|
|
|
{
|
|
|
|
int rename_limit = options->rename_limit;
|
|
|
|
int num_src = rename_src_nr;
|
2011-01-07 00:50:06 +03:00
|
|
|
int i;
|
2011-01-07 00:50:04 +03:00
|
|
|
|
|
|
|
options->needed_rename_limit = 0;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* This basically does a test for the rename matrix not
|
|
|
|
* growing larger than a "rename_limit" square matrix, ie:
|
|
|
|
*
|
|
|
|
* num_create * num_src > rename_limit * rename_limit
|
|
|
|
*
|
|
|
|
* but handles the potential overflow case specially (and we
|
|
|
|
* assume at least 32-bit integers)
|
|
|
|
*/
|
|
|
|
if (rename_limit <= 0 || rename_limit > 32767)
|
|
|
|
rename_limit = 32767;
|
|
|
|
if ((num_create <= rename_limit || num_src <= rename_limit) &&
|
|
|
|
(num_create * num_src <= rename_limit * rename_limit))
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
options->needed_rename_limit =
|
|
|
|
num_src > num_create ? num_src : num_create;
|
2011-01-07 00:50:06 +03:00
|
|
|
|
|
|
|
/* Are we running under -C -C? */
|
|
|
|
if (!DIFF_OPT_TST(options, FIND_COPIES_HARDER))
|
|
|
|
return 1;
|
|
|
|
|
|
|
|
/* Would we bust the limit if we were running under -C? */
|
|
|
|
for (num_src = i = 0; i < rename_src_nr; i++) {
|
|
|
|
if (diff_unmodified_pair(rename_src[i].p))
|
|
|
|
continue;
|
|
|
|
num_src++;
|
|
|
|
}
|
|
|
|
if ((num_create <= rename_limit || num_src <= rename_limit) &&
|
|
|
|
(num_create * num_src <= rename_limit * rename_limit))
|
|
|
|
return 2;
|
2011-01-07 00:50:04 +03:00
|
|
|
return 1;
|
|
|
|
}
|
|
|
|
|
2011-02-19 07:10:32 +03:00
|
|
|
static int find_renames(struct diff_score *mx, int dst_cnt, int minimum_score, int copies)
|
|
|
|
{
|
|
|
|
int count = 0, i;
|
|
|
|
|
|
|
|
for (i = 0; i < dst_cnt * NUM_CANDIDATE_PER_DST; i++) {
|
|
|
|
struct diff_rename_dst *dst;
|
|
|
|
|
|
|
|
if ((mx[i].dst < 0) ||
|
|
|
|
(mx[i].score < minimum_score))
|
|
|
|
break; /* there is no more usable pair. */
|
|
|
|
dst = &rename_dst[mx[i].dst];
|
|
|
|
if (dst->pair)
|
|
|
|
continue; /* already done, either exact or fuzzy. */
|
2011-01-07 00:50:05 +03:00
|
|
|
if (!copies && rename_src[mx[i].src].p->one->rename_used)
|
2011-02-19 07:10:32 +03:00
|
|
|
continue;
|
|
|
|
record_rename_pair(mx[i].dst, mx[i].src, mx[i].score);
|
|
|
|
count++;
|
|
|
|
}
|
|
|
|
return count;
|
|
|
|
}
|
|
|
|
|
2005-09-21 11:18:27 +04:00
|
|
|
void diffcore_rename(struct diff_options *options)
|
2005-05-21 13:39:09 +04:00
|
|
|
{
|
2005-09-21 11:18:27 +04:00
|
|
|
int detect_rename = options->detect_rename;
|
|
|
|
int minimum_score = options->rename_score;
|
2005-05-22 06:40:36 +04:00
|
|
|
struct diff_queue_struct *q = &diff_queued_diff;
|
2005-09-16 03:13:43 +04:00
|
|
|
struct diff_queue_struct outq;
|
2005-05-21 13:39:09 +04:00
|
|
|
struct diff_score *mx;
|
2011-01-07 00:50:06 +03:00
|
|
|
int i, j, rename_count, skip_unmodified = 0;
|
2011-04-29 13:42:41 +04:00
|
|
|
int num_create, dst_cnt;
|
2011-02-20 12:51:16 +03:00
|
|
|
struct progress *progress = NULL;
|
2005-05-21 13:39:09 +04:00
|
|
|
|
2005-05-22 10:33:32 +04:00
|
|
|
if (!minimum_score)
|
[PATCH] Add -B flag to diff-* brothers.
A new diffcore transformation, diffcore-break.c, is introduced.
When the -B flag is given, a patch that represents a complete
rewrite is broken into a deletion followed by a creation. This
makes it easier to review such a complete rewrite patch.
The -B flag takes the same syntax as the -M and -C flags to
specify the minimum amount of non-source material the resulting
file needs to have to be considered a complete rewrite, and
defaults to 99% if not specified.
As the new test t4008-diff-break-rewrite.sh demonstrates, if a
file is a complete rewrite, it is broken into a delete/create
pair, which can further be subjected to the usual rename
detection if -M or -C is used. For example, if file0 gets
completely rewritten to make it as if it were rather based on
file1 which itself disappeared, the following happens:
The original change looks like this:
file0 --> file0' (quite different from file0)
file1 --> /dev/null
After diffcore-break runs, it would become this:
file0 --> /dev/null
/dev/null --> file0'
file1 --> /dev/null
Then diffcore-rename matches them up:
file1 --> file0'
The internal score values are finer grained now. Earlier
maximum of 10000 has been raised to 60000; there is no user
visible changes but there is no reason to waste available bits.
Signed-off-by: Junio C Hamano <junkio@cox.net>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-05-30 11:08:37 +04:00
|
|
|
minimum_score = DEFAULT_RENAME_SCORE;
|
2005-05-21 13:39:09 +04:00
|
|
|
|
|
|
|
for (i = 0; i < q->nr; i++) {
|
2005-05-21 13:40:01 +04:00
|
|
|
struct diff_filepair *p = q->queue[i];
|
2006-11-02 11:02:11 +03:00
|
|
|
if (!DIFF_FILE_VALID(p->one)) {
|
2005-05-22 06:42:18 +04:00
|
|
|
if (!DIFF_FILE_VALID(p->two))
|
2005-05-23 08:24:49 +04:00
|
|
|
continue; /* unmerged */
|
2006-11-02 11:02:11 +03:00
|
|
|
else if (options->single_follow &&
|
|
|
|
strcmp(options->single_follow, p->two->path))
|
|
|
|
continue; /* not interested */
|
teach diffcore-rename to optionally ignore empty content
Our rename detection is a heuristic, matching pairs of
removed and added files with similar or identical content.
It's unlikely to be wrong when there is actual content to
compare, and we already take care not to do inexact rename
detection when there is not enough content to produce good
results.
However, we always do exact rename detection, even when the
blob is tiny or empty. It's easy to get false positives with
an empty blob, simply because it is an obvious content to
use as a boilerplate (e.g., when telling git that an empty
directory is worth tracking via an empty .gitignore).
This patch lets callers specify whether or not they are
interested in using empty files as rename sources and
destinations. The default is "yes", keeping the original
behavior. It works by detecting the empty-blob sha1 for
rename sources and destinations.
One more flexible alternative would be to allow the caller
to specify a minimum size for a blob to be "interesting" for
rename detection. But that would catch small boilerplate
files, not large ones (e.g., if you had the GPL COPYING file
in many directories).
A better alternative would be to allow a "-rename"
gitattribute to allow boilerplate files to be marked as
such. I'll leave the complexity of that solution until such
time as somebody actually wants it. The complaints we've
seen so far revolve around empty files, so let's start with
the simple thing.
Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2012-03-23 02:52:13 +04:00
|
|
|
else if (!DIFF_OPT_TST(options, RENAME_EMPTY) &&
|
|
|
|
is_empty_blob_sha1(p->two->sha1))
|
|
|
|
continue;
|
2005-05-21 13:39:09 +04:00
|
|
|
else
|
2005-05-24 12:10:48 +04:00
|
|
|
locate_rename_dst(p->two, 1);
|
2006-11-02 11:02:11 +03:00
|
|
|
}
|
teach diffcore-rename to optionally ignore empty content
Our rename detection is a heuristic, matching pairs of
removed and added files with similar or identical content.
It's unlikely to be wrong when there is actual content to
compare, and we already take care not to do inexact rename
detection when there is not enough content to produce good
results.
However, we always do exact rename detection, even when the
blob is tiny or empty. It's easy to get false positives with
an empty blob, simply because it is an obvious content to
use as a boilerplate (e.g., when telling git that an empty
directory is worth tracking via an empty .gitignore).
This patch lets callers specify whether or not they are
interested in using empty files as rename sources and
destinations. The default is "yes", keeping the original
behavior. It works by detecting the empty-blob sha1 for
rename sources and destinations.
One more flexible alternative would be to allow the caller
to specify a minimum size for a blob to be "interesting" for
rename detection. But that would catch small boilerplate
files, not large ones (e.g., if you had the GPL COPYING file
in many directories).
A better alternative would be to allow a "-rename"
gitattribute to allow boilerplate files to be marked as
such. I'll leave the complexity of that solution until such
time as somebody actually wants it. The complaints we've
seen so far revolve around empty files, so let's start with
the simple thing.
Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2012-03-23 02:52:13 +04:00
|
|
|
else if (!DIFF_OPT_TST(options, RENAME_EMPTY) &&
|
|
|
|
is_empty_blob_sha1(p->one->sha1))
|
|
|
|
continue;
|
diffcore-rename: don't consider unmerged path as source
Since e9c8409 (diff-index --cached --raw: show tree entry on the LHS for
unmerged entries., 2007-01-05), an unmerged entry should be detected by
using DIFF_PAIR_UNMERGED(p), not by noticing both one and two sides of
the filepair records mode=0 entries. However, it forgot to update some
parts of the rename detection logic.
This only makes a difference in the "diff --cached" codepath where an
unmerged filepair carries information on the entries that came from the
tree. It probably hasn't been noticed for a long time because nobody
would run "diff -M" during a conflict resolution, but "git status" uses
rename detection when it internally runs "diff-index" and "diff-files"
and gives nonsense results.
In an unmerged pair, "one" side can have a valid filespec to record the
tree entry (e.g. what's in HEAD) when running "diff --cached". This can
be used as a rename source to other paths in the index that are not
unmerged. The path that is unmerged by definition does not have the
final content yet (i.e. "two" side cannot have a valid filespec), so it
can never be a rename destination.
Use the DIFF_PAIR_UNMERGED() to detect unmerged filepair correctly, and
allow the valid "one" side of an unmerged filepair to be considered a
potential rename source, but never to be considered a rename destination.
Commit message and first two test cases by Junio, the rest by Martin.
Signed-off-by: Martin von Zweigbergk <martin.von.zweigbergk@gmail.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2011-03-24 05:41:01 +03:00
|
|
|
else if (!DIFF_PAIR_UNMERGED(p) && !DIFF_FILE_VALID(p->two)) {
|
2007-10-25 22:20:56 +04:00
|
|
|
/*
|
|
|
|
* If the source is a broken "delete", and
|
2005-06-12 07:55:20 +04:00
|
|
|
* they did not really want to get broken,
|
|
|
|
* that means the source actually stays.
|
2007-10-25 22:20:56 +04:00
|
|
|
* So we increment the "rename_used" score
|
|
|
|
* by one, to indicate ourselves as a user
|
|
|
|
*/
|
|
|
|
if (p->broken_pair && !p->score)
|
|
|
|
p->one->rename_used++;
|
2011-01-07 00:50:05 +03:00
|
|
|
register_rename_src(p);
|
2007-10-25 22:20:56 +04:00
|
|
|
}
|
|
|
|
else if (detect_rename == DIFF_DETECT_COPY) {
|
|
|
|
/*
|
|
|
|
* Increment the "rename_used" score by
|
|
|
|
* one, to indicate ourselves as a user.
|
2005-06-12 07:55:20 +04:00
|
|
|
*/
|
2007-10-25 22:20:56 +04:00
|
|
|
p->one->rename_used++;
|
2011-01-07 00:50:05 +03:00
|
|
|
register_rename_src(p);
|
2005-06-12 07:55:20 +04:00
|
|
|
}
|
2005-05-21 13:39:09 +04:00
|
|
|
}
|
Fix the rename detection limit checking
This adds more proper rename detection limits. Instead of just checking
the limit against the number of potential rename destinations, we verify
that the rename matrix (which is what really matters) doesn't grow
ridiculously large, and we also make sure that we don't overflow when
doing the matrix size calculation.
This also changes the default limits from unlimited, to a rename matrix
that is limited to 100 entries on a side. You can raise it with the config
entry, or by using the "-l<n>" command line flag, but at least the default
is now a sane number that avoids spending lots of time (and memory) in
situations that likely don't merit it.
The choice of default value is of course very debatable. Limiting the
rename matrix to a 100x100 size will mean that even if you have just one
obvious rename, but you also create (or delete) 10,000 files, the rename
matrix will be so big that we disable the heuristics. Sounds reasonable to
me, but let's see if people hit this (and, perhaps more importantly,
actually *care*) in real life.
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2007-09-14 21:39:48 +04:00
|
|
|
if (rename_dst_nr == 0 || rename_src_nr == 0)
|
2005-05-21 13:39:09 +04:00
|
|
|
goto cleanup; /* nothing to do */
|
|
|
|
|
Do exact rename detection regardless of rename limits
Now that the exact rename detection is linear-time (with a very small
constant factor to boot), there is no longer any reason to limit it by
the number of files involved.
In some trivial testing, I created a repository with a directory that
had a hundred thousand files in it (all with different contents), and
then moved that directory to show the effects of renaming 100,000 files.
With the new code, that resulted in
[torvalds@woody big-rename]$ time ~/git/git show -C | wc -l
400006
real 0m2.071s
user 0m1.520s
sys 0m0.576s
ie the code can correctly detect the hundred thousand renames in about 2
seconds (the number "400006" comes from four lines for each rename:
diff --git a/really-big-dir/file-1-1-1-1-1 b/moved-big-dir/file-1-1-1-1-1
similarity index 100%
rename from really-big-dir/file-1-1-1-1-1
rename to moved-big-dir/file-1-1-1-1-1
and the extra six lines are from a one-liner commit message and all the
commit information and spacing).
Most of those two seconds weren't even really the rename detection, it's
really all the other stuff needed to get there.
With the old code, this wouldn't have been practically possible. Doing
a pairwise check of the ten billion possible pairs would have been
prohibitively expensive. In fact, even with the rename limiter in
place, the old code would waste a lot of time just on the diff_filespec
checks, and despite not even trying to find renames, it used to look
like:
[torvalds@woody big-rename]$ time git show -C | wc -l
1400006
real 0m12.337s
user 0m12.285s
sys 0m0.192s
ie we used to take 12 seconds for this load and not even do any rename
detection! (The number 1400006 comes from fourteen lines per file moved:
seven lines each for the delete and the create of a one-liner file, and
the same extra six lines of commit information).
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2007-10-25 22:24:47 +04:00
|
|
|
/*
|
|
|
|
* We really want to cull the candidates list early
|
|
|
|
* with cheap tests in order to avoid doing deltas.
|
|
|
|
*/
|
2011-02-19 06:55:19 +03:00
|
|
|
rename_count = find_exact_renames(options);
|
Do exact rename detection regardless of rename limits
Now that the exact rename detection is linear-time (with a very small
constant factor to boot), there is no longer any reason to limit it by
the number of files involved.
In some trivial testing, I created a repository with a directory that
had a hundred thousand files in it (all with different contents), and
then moved that directory to show the effects of renaming 100,000 files.
With the new code, that resulted in
[torvalds@woody big-rename]$ time ~/git/git show -C | wc -l
400006
real 0m2.071s
user 0m1.520s
sys 0m0.576s
ie the code can correctly detect the hundred thousand renames in about 2
seconds (the number "400006" comes from four lines for each rename:
diff --git a/really-big-dir/file-1-1-1-1-1 b/moved-big-dir/file-1-1-1-1-1
similarity index 100%
rename from really-big-dir/file-1-1-1-1-1
rename to moved-big-dir/file-1-1-1-1-1
and the extra six lines are from a one-liner commit message and all the
commit information and spacing).
Most of those two seconds weren't even really the rename detection, it's
really all the other stuff needed to get there.
With the old code, this wouldn't have been practically possible. Doing
a pairwise check of the ten billion possible pairs would have been
prohibitively expensive. In fact, even with the rename limiter in
place, the old code would waste a lot of time just on the diff_filespec
checks, and despite not even trying to find renames, it used to look
like:
[torvalds@woody big-rename]$ time git show -C | wc -l
1400006
real 0m12.337s
user 0m12.285s
sys 0m0.192s
ie we used to take 12 seconds for this load and not even do any rename
detection! (The number 1400006 comes from fourteen lines per file moved:
seven lines each for the delete and the create of a one-liner file, and
the same extra six lines of commit information).
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2007-10-25 22:24:47 +04:00
|
|
|
|
2007-10-27 03:56:34 +04:00
|
|
|
/* Did we only want exact renames? */
|
|
|
|
if (minimum_score == MAX_SCORE)
|
|
|
|
goto cleanup;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Calculate how many renames are left (but all the source
|
|
|
|
* files still remain as options for rename/copies!)
|
|
|
|
*/
|
|
|
|
num_create = (rename_dst_nr - rename_count);
|
|
|
|
|
|
|
|
/* All done? */
|
|
|
|
if (!num_create)
|
|
|
|
goto cleanup;
|
|
|
|
|
2011-01-07 00:50:06 +03:00
|
|
|
switch (too_many_rename_candidates(num_create, options)) {
|
|
|
|
case 1:
|
Fix the rename detection limit checking
This adds more proper rename detection limits. Instead of just checking
the limit against the number of potential rename destinations, we verify
that the rename matrix (which is what really matters) doesn't grow
ridiculously large, and we also make sure that we don't overflow when
doing the matrix size calculation.
This also changes the default limits from unlimited, to a rename matrix
that is limited to 100 entries on a side. You can raise it with the config
entry, or by using the "-l<n>" command line flag, but at least the default
is now a sane number that avoids spending lots of time (and memory) in
situations that likely don't merit it.
The choice of default value is of course very debatable. Limiting the
rename matrix to a 100x100 size will mean that even if you have just one
obvious rename, but you also create (or delete) 10,000 files, the rename
matrix will be so big that we disable the heuristics. Sounds reasonable to
me, but let's see if people hit this (and, perhaps more importantly,
actually *care*) in real life.
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2007-09-14 21:39:48 +04:00
|
|
|
goto cleanup;
|
2011-01-07 00:50:06 +03:00
|
|
|
case 2:
|
|
|
|
options->degraded_cc_to_c = 1;
|
|
|
|
skip_unmodified = 1;
|
|
|
|
break;
|
|
|
|
default:
|
|
|
|
break;
|
|
|
|
}
|
Fix the rename detection limit checking
This adds more proper rename detection limits. Instead of just checking
the limit against the number of potential rename destinations, we verify
that the rename matrix (which is what really matters) doesn't grow
ridiculously large, and we also make sure that we don't overflow when
doing the matrix size calculation.
This also changes the default limits from unlimited, to a rename matrix
that is limited to 100 entries on a side. You can raise it with the config
entry, or by using the "-l<n>" command line flag, but at least the default
is now a sane number that avoids spending lots of time (and memory) in
situations that likely don't merit it.
The choice of default value is of course very debatable. Limiting the
rename matrix to a 100x100 size will mean that even if you have just one
obvious rename, but you also create (or delete) 10,000 files, the rename
matrix will be so big that we disable the heuristics. Sounds reasonable to
me, but let's see if people hit this (and, perhaps more importantly,
actually *care*) in real life.
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2007-09-14 21:39:48 +04:00
|
|
|
|
2011-02-20 12:51:16 +03:00
|
|
|
if (options->show_rename_progress) {
|
|
|
|
progress = start_progress_delay(
|
|
|
|
"Performing inexact rename detection",
|
|
|
|
rename_dst_nr * rename_src_nr, 50, 1);
|
|
|
|
}
|
|
|
|
|
Optimize rename detection for a huge diff
When there are N deleted paths and M created paths, we used to
allocate (N x M) "struct diff_score" entries that record how similar
each pair is, and picked the <src,dst> pair that gives
the best match first, and then went on to process worse matches.
This sorting is done so that when two new files in the postimage
are similar to the same file deleted from the preimage, we
can process the more similar one first, and when processing the
second one, it can notice "Ah, the source I was planning to say
I am a copy of is already taken by somebody else" and continue
on to match itself with another file in the preimage with a
lesser match. This matters to a change introduced between the
1.5.3.X series and 1.5.4-rc that lets the code favor unused
matches first and then fall back to using already used
matches.
This instead allocates and keeps only a handful of rename source
candidates per new file in the postimage, i.e. it reduces the
memory requirement from O(N x M) to O(M).
For each dst, we compute similarity with all sources (i.e. the
number of similarity estimate computations is still O(N x M)),
but we keep only a handful of the best src candidates for each dst.
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2008-01-30 07:54:56 +03:00
|
|
|
mx = xcalloc(num_create * NUM_CANDIDATE_PER_DST, sizeof(*mx));
|
2005-05-24 12:10:48 +04:00
|
|
|
for (dst_cnt = i = 0; i < rename_dst_nr; i++) {
|
|
|
|
struct diff_filespec *two = rename_dst[i].two;
|
Optimize rename detection for a huge diff
When there are N deleted paths and M created paths, we used to
allocate (N x M) "struct diff_score" entries that record how similar
each pair is, and picked the <src,dst> pair that gives
the best match first, and then went on to process worse matches.
This sorting is done so that when two new files in the postimage
are similar to the same file deleted from the preimage, we
can process the more similar one first, and when processing the
second one, it can notice "Ah, the source I was planning to say
I am a copy of is already taken by somebody else" and continue
on to match itself with another file in the preimage with a
lesser match. This matters to a change introduced between the
1.5.3.X series and 1.5.4-rc that lets the code favor unused
matches first and then fall back to using already used
matches.
This instead allocates and keeps only a handful of rename source
candidates per new file in the postimage, i.e. it reduces the
memory requirement from O(N x M) to O(M).
For each dst, we compute similarity with all sources (i.e. the
number of similarity estimate computations is still O(N x M)),
but we keep only a handful of the best src candidates for each dst.
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2008-01-30 07:54:56 +03:00
|
|
|
struct diff_score *m;
|
|
|
|
|
2005-05-24 12:10:48 +04:00
|
|
|
if (rename_dst[i].pair)
|
2005-05-21 13:39:09 +04:00
|
|
|
continue; /* dealt with exact match already. */
|
Optimize rename detection for a huge diff
When there are N deleted paths and M created paths, we used to
allocate (N x M) "struct diff_score" entries that record how similar
each pair is, and picked the <src,dst> pair that gives
the best match first, and then went on to process worse matches.
This sorting is done so that when two new files in the postimage
are similar to the same file deleted from the preimage, we
can process the more similar one first, and when processing the
second one, it can notice "Ah, the source I was planning to say
I am a copy of is already taken by somebody else" and continue
on to match itself with another file in the preimage with a
lesser match. This matters to a change introduced between the
1.5.3.X series and 1.5.4-rc that lets the code favor unused
matches first and then fall back to using already used
matches.
This instead allocates and keeps only a handful of rename source
candidates per new file in the postimage, i.e. it reduces the
memory requirement from O(N x M) to O(M).
For each dst, we compute similarity with all sources (i.e. the
number of similarity estimate computations is still O(N x M)),
but we keep only a handful of the best src candidates for each dst.
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2008-01-30 07:54:56 +03:00
|
|
|
|
|
|
|
m = &mx[dst_cnt * NUM_CANDIDATE_PER_DST];
|
|
|
|
for (j = 0; j < NUM_CANDIDATE_PER_DST; j++)
|
|
|
|
m[j].dst = -1;
|
|
|
|
|
2005-05-24 12:10:48 +04:00
|
|
|
for (j = 0; j < rename_src_nr; j++) {
|
2011-01-07 00:50:05 +03:00
|
|
|
struct diff_filespec *one = rename_src[j].p->one;
|
Optimize rename detection for a huge diff
When there are N deleted paths and M created paths, we used to
allocate (N x M) "struct diff_score" entries that record how similar
each pair is, and picked the <src,dst> pair that gives
the best match first, and then went on to process worse matches.
This sorting is done so that when two new files in the postimage
are similar to the same file deleted from the preimage, we
can process the more similar one first, and when processing the
second one, it can notice "Ah, the source I was planning to say
I am a copy of is already taken by somebody else" and continue
on to match itself with another file in the preimage with a
lesser match. This matters to a change introduced between the
1.5.3.X series and 1.5.4-rc that lets the code favor unused
matches first and then fall back to using already used
matches.
This instead allocates and keeps only a handful of rename source
candidates per new file in the postimage, i.e. it reduces the
memory requirement from O(N x M) to O(M).
For each dst, we compute similarity with all sources (i.e. the
number of similarity estimate computations is still O(N x M)),
but we keep only a handful of the best src candidates for each dst.
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2008-01-30 07:54:56 +03:00
|
|
|
struct diff_score this_src;
|
2011-01-07 00:50:06 +03:00
|
|
|
|
|
|
|
if (skip_unmodified &&
|
|
|
|
diff_unmodified_pair(rename_src[j].p))
|
|
|
|
continue;
|
|
|
|
|
Optimize rename detection for a huge diff
When there are N deleted paths and M created paths, we used to
allocate (N x M) "struct diff_score" entries that record how similar
each pair is, and picked the <src,dst> pair that gives
the best match first, and then went on to process worse matches.
This sorting is done so that when two new files in the postimage
are similar to the same file deleted from the preimage, we
can process the more similar one first, and when processing the
second one, it can notice "Ah, the source I was planning to say
I am a copy of is already taken by somebody else" and continue
on to match itself with another file in the preimage with a
lesser match. This matters to a change introduced between the
1.5.3.X series and 1.5.4-rc that lets the code favor unused
matches first and then fall back to using already used
matches.
This instead allocates and keeps only a handful of rename source
candidates per new file in the postimage, i.e. it reduces the
memory requirement from O(N x M) to O(M).
For each dst, we compute similarity with all sources (i.e. the
number of similarity estimate computations is still O(N x M)),
but we keep only a handful of the best src candidates for each dst.
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2008-01-30 07:54:56 +03:00
|
|
|
this_src.score = estimate_similarity(one, two,
|
|
|
|
minimum_score);
|
|
|
|
this_src.name_score = basename_same(one, two);
|
|
|
|
this_src.dst = i;
|
|
|
|
this_src.src = j;
|
|
|
|
record_if_better(m, &this_src);
|
2009-11-21 09:13:47 +03:00
|
|
|
/*
|
|
|
|
* Once we run estimate_similarity,
|
|
|
|
* We do not need the text anymore.
|
|
|
|
*/
|
2007-10-03 08:01:03 +04:00
|
|
|
diff_free_filespec_blob(one);
|
2009-11-21 09:13:47 +03:00
|
|
|
diff_free_filespec_blob(two);
|
2005-05-21 13:39:09 +04:00
|
|
|
}
|
|
|
|
dst_cnt++;
|
2011-02-20 12:51:16 +03:00
|
|
|
display_progress(progress, (i+1)*rename_src_nr);
|
2005-05-21 13:39:09 +04:00
|
|
|
}
|
2011-02-20 12:51:16 +03:00
|
|
|
stop_progress(&progress);
|
Optimize rename detection for a huge diff
When there are N deleted paths and M created paths, we used to
allocate (N x M) "struct diff_score" entries that record how similar
each pair is, and picked the <src,dst> pair that gives
the best match first, and then went on to process worse matches.
This sorting is done so that when two new files in the postimage
are similar to the same file deleted from the preimage, we
can process the more similar one first, and when processing the
second one, it can notice "Ah, the source I was planning to say
I am a copy of is already taken by somebody else" and continue
on to match itself with another file in the preimage with a
lesser match. This matters to a change introduced between the
1.5.3.X series and 1.5.4-rc that lets the code favor unused
matches first and then fall back to using already used
matches.
This instead allocates and keeps only a handful of rename source
candidates per new file in the postimage, i.e. it reduces the
memory requirement from O(N x M) to O(M).
For each dst, we compute similarity with all sources (i.e. the
number of similarity estimate computations is still O(N x M)),
but we keep only a handful of the best src candidates for each dst.
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2008-01-30 07:54:56 +03:00
|
|
|
|
2005-05-21 13:39:09 +04:00
|
|
|
/* cost matrix sorted by most to least similar pair */
|
Optimize rename detection for a huge diff
When there are N deleted paths and M created paths, we used to
allocate (N x M) "struct diff_score" entries that record how similar
each pair is, and picked the <src,dst> pair that gives
the best match first, and then went on to process worse matches.
This sorting is done so that when two new files in the postimage
are similar to the same file deleted from the preimage, we
can process the more similar one first, and when processing the
second one, it can notice "Ah, the source I was planning to say
I am a copy of is already taken by somebody else" and continue
on to match itself with another file in the preimage with a
lesser match. This matters to a change introduced between the
1.5.3.X series and 1.5.4-rc that lets the code favor unused
matches first and then fall back to using already used
matches.
This instead allocates and keeps only a handful of rename source
candidates per new file in the postimage, i.e. it reduces the
memory requirement from O(N x M) to O(M).
For each dst, we compute similarity with all sources (i.e. the
number of similarity estimate computations is still O(N x M)),
but we keep only a handful of the best src candidates for each dst.
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2008-01-30 07:54:56 +03:00
|
|
|
qsort(mx, dst_cnt * NUM_CANDIDATE_PER_DST, sizeof(*mx), score_compare);
|
|
|
|
|
2011-02-19 07:10:32 +03:00
|
|
|
rename_count += find_renames(mx, dst_cnt, minimum_score, 0);
|
|
|
|
if (detect_rename == DIFF_DETECT_COPY)
|
|
|
|
rename_count += find_renames(mx, dst_cnt, minimum_score, 1);
|
2005-05-21 13:39:09 +04:00
|
|
|
free(mx);
|
|
|
|
|
2005-05-28 02:55:55 +04:00
|
|
|
cleanup:
|
2005-05-21 13:39:09 +04:00
|
|
|
/* At this point, we have found some renames and copies and they
|
2005-09-16 03:13:43 +04:00
|
|
|
* are recorded in rename_dst. The original list is still in *q.
|
2005-05-21 13:39:09 +04:00
|
|
|
*/
|
2010-05-07 08:52:27 +04:00
|
|
|
DIFF_QUEUE_CLEAR(&outq);
|
2005-05-21 13:39:09 +04:00
|
|
|
for (i = 0; i < q->nr; i++) {
|
2005-05-24 12:10:48 +04:00
|
|
|
struct diff_filepair *p = q->queue[i];
|
|
|
|
struct diff_filepair *pair_to_free = NULL;
|
|
|
|
|
diffcore-rename: don't consider unmerged path as source
Since e9c8409 (diff-index --cached --raw: show tree entry on the LHS for
unmerged entries., 2007-01-05), an unmerged entry should be detected by
using DIFF_PAIR_UNMERGED(p), not by noticing both one and two sides of
the filepair records mode=0 entries. However, it forgot to update some
parts of the rename detection logic.
This only makes a difference in the "diff --cached" codepath where an
unmerged filepair carries information on the entries that came from the
tree. It probably hasn't been noticed for a long time because nobody
would run "diff -M" during a conflict resolution, but "git status" uses
rename detection when it internally runs "diff-index" and "diff-files"
and gives nonsense results.
In an unmerged pair, "one" side can have a valid filespec to record the
tree entry (e.g. what's in HEAD) when running "diff --cached". This can
be used as a rename source to other paths in the index that are not
unmerged. The path that is unmerged by definition does not have the
final content yet (i.e. "two" side cannot have a valid filespec), so it
can never be a rename destination.
Use the DIFF_PAIR_UNMERGED() to detect unmerged filepair correctly, and
allow the valid "one" side of an unmerged filepair to be considered a
potential rename source, but never to be considered a rename destination.
Commit message and first two test cases by Junio, the rest by Martin.
Signed-off-by: Martin von Zweigbergk <martin.von.zweigbergk@gmail.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2011-03-24 05:41:01 +03:00
|
|
|
if (DIFF_PAIR_UNMERGED(p)) {
|
|
|
|
diff_q(&outq, p);
|
|
|
|
}
|
|
|
|
else if (!DIFF_FILE_VALID(p->one) && DIFF_FILE_VALID(p->two)) {
|
2005-05-30 11:08:07 +04:00
|
|
|
/*
|
|
|
|
* Creation
|
|
|
|
*
|
|
|
|
* We would output this create record if it has
|
|
|
|
* not been turned into a rename/copy already.
|
|
|
|
*/
|
|
|
|
struct diff_rename_dst *dst =
|
|
|
|
locate_rename_dst(p->two, 0);
|
|
|
|
if (dst && dst->pair) {
|
2005-05-24 12:10:48 +04:00
|
|
|
diff_q(&outq, dst->pair);
|
|
|
|
pair_to_free = p;
|
|
|
|
}
|
|
|
|
else
|
2005-05-30 11:08:07 +04:00
|
|
|
/* no matching rename/copy source, so
|
|
|
|
* record this as a creation.
|
2005-05-24 12:10:48 +04:00
|
|
|
*/
|
|
|
|
diff_q(&outq, p);
|
2005-05-21 13:39:09 +04:00
|
|
|
}
|
2005-05-30 11:08:07 +04:00
|
|
|
else if (DIFF_FILE_VALID(p->one) && !DIFF_FILE_VALID(p->two)) {
|
|
|
|
/*
|
|
|
|
* Deletion
|
|
|
|
*
|
[PATCH] Add -B flag to diff-* brothers.
A new diffcore transformation, diffcore-break.c, is introduced.
When the -B flag is given, a patch that represents a complete
rewrite is broken into a deletion followed by a creation. This
makes it easier to review such a complete rewrite patch.
The -B flag takes the same syntax as the -M and -C flags to
specify the minimum amount of non-source material the resulting
file needs to have to be considered a complete rewrite, and
defaults to 99% if not specified.
As the new test t4008-diff-break-rewrite.sh demonstrates, if a
file is a complete rewrite, it is broken into a delete/create
pair, which can further be subjected to the usual rename
detection if -M or -C is used. For example, if file0 gets
completely rewritten to make it as if it were rather based on
file1 which itself disappeared, the following happens:
The original change looks like this:
file0 --> file0' (quite different from file0)
file1 --> /dev/null
After diffcore-break runs, it would become this:
file0 --> /dev/null
/dev/null --> file0'
file1 --> /dev/null
Then diffcore-rename matches them up:
file1 --> file0'
The internal score values are finer grained now. The earlier
maximum of 10000 has been raised to 60000; there are no user-visible
changes, but there is no reason to waste available bits.
Signed-off-by: Junio C Hamano <junkio@cox.net>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-05-30 11:08:37 +04:00
|
|
|
* We would output this delete record if:
|
|
|
|
*
|
|
|
|
* (1) this is a broken delete and the counterpart
|
|
|
|
* broken create remains in the output; or
|
2005-09-16 03:13:43 +04:00
|
|
|
* (2) this is not a broken delete, and rename_dst
|
|
|
|
* does not have a rename/copy to move p->one->path
|
|
|
|
* out of existence.
|
[PATCH] Add -B flag to diff-* brothers.
A new diffcore transformation, diffcore-break.c, is introduced.
When the -B flag is given, a patch that represents a complete
rewrite is broken into a deletion followed by a creation. This
makes it easier to review such a complete rewrite patch.
The -B flag takes the same syntax as the -M and -C flags to
specify the minimum amount of non-source material the resulting
file needs to have to be considered a complete rewrite, and
defaults to 99% if not specified.
As the new test t4008-diff-break-rewrite.sh demonstrates, if a
file is a complete rewrite, it is broken into a delete/create
pair, which can further be subjected to the usual rename
detection if -M or -C is used. For example, if file0 gets
completely rewritten to make it as if it were rather based on
file1 which itself disappeared, the following happens:
The original change looks like this:
file0 --> file0' (quite different from file0)
file1 --> /dev/null
After diffcore-break runs, it would become this:
file0 --> /dev/null
/dev/null --> file0'
file1 --> /dev/null
Then diffcore-rename matches them up:
file1 --> file0'
The internal score values are finer grained now. The earlier
maximum of 10000 has been raised to 60000; there are no user-visible
changes, but there is no reason to waste available bits.
Signed-off-by: Junio C Hamano <junkio@cox.net>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-05-30 11:08:37 +04:00
|
|
|
*
|
|
|
|
* Otherwise, the counterpart broken create
|
|
|
|
* has been turned into a rename-edit; or
|
|
|
|
* delete did not have a matching create to
|
|
|
|
* begin with.
|
2005-05-30 11:08:07 +04:00
|
|
|
*/
|
[PATCH] Add -B flag to diff-* brothers.
A new diffcore transformation, diffcore-break.c, is introduced.
When the -B flag is given, a patch that represents a complete
rewrite is broken into a deletion followed by a creation. This
makes it easier to review such a complete rewrite patch.
The -B flag takes the same syntax as the -M and -C flags to
specify the minimum amount of non-source material the resulting
file needs to have to be considered a complete rewrite, and
defaults to 99% if not specified.
As the new test t4008-diff-break-rewrite.sh demonstrates, if a
file is a complete rewrite, it is broken into a delete/create
pair, which can further be subjected to the usual rename
detection if -M or -C is used. For example, if file0 gets
completely rewritten to make it as if it were rather based on
file1 which itself disappeared, the following happens:
The original change looks like this:
file0 --> file0' (quite different from file0)
file1 --> /dev/null
After diffcore-break runs, it would become this:
file0 --> /dev/null
/dev/null --> file0'
file1 --> /dev/null
Then diffcore-rename matches them up:
file1 --> file0'
The internal score values are finer grained now. The earlier
maximum of 10000 has been raised to 60000; there are no user-visible
changes, but there is no reason to waste available bits.
Signed-off-by: Junio C Hamano <junkio@cox.net>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-05-30 11:08:37 +04:00
|
|
|
if (DIFF_PAIR_BROKEN(p)) {
|
|
|
|
/* broken delete */
|
|
|
|
struct diff_rename_dst *dst =
|
|
|
|
locate_rename_dst(p->one, 0);
|
|
|
|
if (dst && dst->pair)
|
|
|
|
/* counterpart is now rename/copy */
|
|
|
|
pair_to_free = p;
|
|
|
|
}
|
|
|
|
else {
|
2007-10-25 22:20:56 +04:00
|
|
|
if (p->one->rename_used)
|
[PATCH] Add -B flag to diff-* brothers.
A new diffcore transformation, diffcore-break.c, is introduced.
When the -B flag is given, a patch that represents a complete
rewrite is broken into a deletion followed by a creation. This
makes it easier to review such a complete rewrite patch.
The -B flag takes the same syntax as the -M and -C flags to
specify the minimum amount of non-source material the resulting
file needs to have to be considered a complete rewrite, and
defaults to 99% if not specified.
As the new test t4008-diff-break-rewrite.sh demonstrates, if a
file is a complete rewrite, it is broken into a delete/create
pair, which can further be subjected to the usual rename
detection if -M or -C is used. For example, if file0 gets
completely rewritten to make it as if it were rather based on
file1 which itself disappeared, the following happens:
The original change looks like this:
file0 --> file0' (quite different from file0)
file1 --> /dev/null
After diffcore-break runs, it would become this:
file0 --> /dev/null
/dev/null --> file0'
file1 --> /dev/null
Then diffcore-rename matches them up:
file1 --> file0'
The internal score values are finer grained now. The earlier
maximum of 10000 has been raised to 60000; there are no
user-visible changes, but there is no reason to waste available bits.
Signed-off-by: Junio C Hamano <junkio@cox.net>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-05-30 11:08:37 +04:00
|
|
|
/* this path remains */
|
|
|
|
pair_to_free = p;
|
|
|
|
}
|
2005-05-30 11:08:07 +04:00
|
|
|
|
|
|
|
if (pair_to_free)
|
|
|
|
;
|
|
|
|
else
|
|
|
|
diff_q(&outq, p);
|
|
|
|
}
|
2005-05-24 12:10:48 +04:00
|
|
|
else if (!diff_unmodified_pair(p))
|
2005-05-28 02:55:55 +04:00
|
|
|
/* all the usual ones need to be kept */
|
2005-05-24 12:10:48 +04:00
|
|
|
diff_q(&outq, p);
|
2005-05-28 02:55:55 +04:00
|
|
|
else
|
|
|
|
/* no need to keep unmodified pairs */
|
|
|
|
pair_to_free = p;
|
|
|
|
|
2005-05-28 02:50:30 +04:00
|
|
|
if (pair_to_free)
|
|
|
|
diff_free_filepair(pair_to_free);
|
2005-05-21 13:39:09 +04:00
|
|
|
}
|
2005-05-24 12:10:48 +04:00
|
|
|
diff_debug_queue("done copying original", &outq);
|
2005-05-21 13:39:09 +04:00
|
|
|
|
2005-05-24 12:10:48 +04:00
|
|
|
free(q->queue);
|
|
|
|
*q = outq;
|
|
|
|
diff_debug_queue("done collapsing", q);
|
2005-05-21 13:39:09 +04:00
|
|
|
|
2007-10-25 22:19:10 +04:00
|
|
|
for (i = 0; i < rename_dst_nr; i++)
|
|
|
|
free_filespec(rename_dst[i].two);
|
2005-09-16 03:13:43 +04:00
|
|
|
|
2005-05-24 12:10:48 +04:00
|
|
|
free(rename_dst);
|
|
|
|
rename_dst = NULL;
|
|
|
|
rename_dst_nr = rename_dst_alloc = 0;
|
|
|
|
free(rename_src);
|
|
|
|
rename_src = NULL;
|
|
|
|
rename_src_nr = rename_src_alloc = 0;
|
2005-05-21 13:39:09 +04:00
|
|
|
return;
|
|
|
|
}
|