Merge branch 'en/ort-perf-batch-8'

Rename detection rework continues.

* en/ort-perf-batch-8:
  diffcore-rename: compute dir_rename_guess from dir_rename_counts
  diffcore-rename: limit dir_rename_counts computation to relevant dirs
  diffcore-rename: compute dir_rename_counts in stages
  diffcore-rename: extend cleanup_dir_rename_info()
  diffcore-rename: move dir_rename_counts into dir_rename_info struct
  diffcore-rename: add function for clearing dir_rename_count
  Move computation of dir_rename_count from merge-ort to diffcore-rename
  diffcore-rename: add a mapping of destination names to their indices
  diffcore-rename: provide basic implementation of idx_possible_rename()
  diffcore-rename: use directory rename guided basename comparisons
This commit is contained in:
Junio C Hamano 2021-03-22 14:00:23 -07:00
Родитель 24119d9d7b 81afdf7a2e
Коммит dd4048d1c7
4 изменённых файлов: 449 добавлений и 153 удалений

Просмотреть файл

@ -187,7 +187,7 @@ mark a file pair as a rename and stop considering other candidates for
better matches. At most, one comparison is done per file in this
preliminary pass; so if there are several remaining ext.txt files
throughout the directory hierarchy after exact rename detection, this
preliminary step will be skipped for those files.
preliminary step may be skipped for those files.
Note. When the "-C" option is used with `--find-copies-harder`
option, 'git diff-{asterisk}' commands feed unmodified filepairs to

Просмотреть файл

@ -367,6 +367,296 @@ static int find_exact_renames(struct diff_options *options)
return renames;
}
struct dir_rename_info {
struct strintmap idx_map;
struct strmap dir_rename_guess;
struct strmap *dir_rename_count;
struct strset *relevant_source_dirs;
unsigned setup;
};
static char *get_dirname(const char *filename)
{
char *slash = strrchr(filename, '/');
return slash ? xstrndup(filename, slash - filename) : xstrdup("");
}
static void dirname_munge(char *filename)
{
char *slash = strrchr(filename, '/');
if (!slash)
slash = filename;
*slash = '\0';
}
static const char *get_highest_rename_path(struct strintmap *counts)
{
int highest_count = 0;
const char *highest_destination_dir = NULL;
struct hashmap_iter iter;
struct strmap_entry *entry;
strintmap_for_each_entry(counts, &iter, entry) {
const char *destination_dir = entry->key;
intptr_t count = (intptr_t)entry->value;
if (count > highest_count) {
highest_count = count;
highest_destination_dir = destination_dir;
}
}
return highest_destination_dir;
}
static void increment_count(struct dir_rename_info *info,
char *old_dir,
char *new_dir)
{
struct strintmap *counts;
struct strmap_entry *e;
/* Get the {new_dirs -> counts} mapping using old_dir */
e = strmap_get_entry(info->dir_rename_count, old_dir);
if (e) {
counts = e->value;
} else {
counts = xmalloc(sizeof(*counts));
strintmap_init_with_options(counts, 0, NULL, 1);
strmap_put(info->dir_rename_count, old_dir, counts);
}
/* Increment the count for new_dir */
strintmap_incr(counts, new_dir, 1);
}
static void update_dir_rename_counts(struct dir_rename_info *info,
struct strset *dirs_removed,
const char *oldname,
const char *newname)
{
char *old_dir = xstrdup(oldname);
char *new_dir = xstrdup(newname);
char new_dir_first_char = new_dir[0];
int first_time_in_loop = 1;
if (!info->setup)
/*
* info->setup is 0 here in two cases: (1) all auxiliary
* vars (like dirs_removed) were NULL so
* initialize_dir_rename_info() returned early, or (2)
* either break detection or copy detection are active so
* that we never called initialize_dir_rename_info(). In
* the former case, we don't have enough info to know if
* directories were renamed (because dirs_removed lets us
* know about a necessary prerequisite, namely if they were
* removed), and in the latter, we don't care about
* directory renames or find_basename_matches.
*
* This matters because both basename and inexact matching
* will also call update_dir_rename_counts(). In either of
* the above two cases info->dir_rename_counts will not
* have been properly initialized which prevents us from
* updating it, but in these two cases we don't care about
* dir_rename_counts anyway, so we can just exit early.
*/
return;
while (1) {
/* Get old_dir, skip if its directory isn't relevant. */
dirname_munge(old_dir);
if (info->relevant_source_dirs &&
!strset_contains(info->relevant_source_dirs, old_dir))
break;
/* Get new_dir */
dirname_munge(new_dir);
/*
* When renaming
* "a/b/c/d/e/foo.c" -> "a/b/some/thing/else/e/foo.c"
* then this suggests that both
* a/b/c/d/e/ => a/b/some/thing/else/e/
* a/b/c/d/ => a/b/some/thing/else/
* so we want to increment counters for both. We do NOT,
* however, also want to suggest that there was the following
* rename:
* a/b/c/ => a/b/some/thing/
* so we need to quit at that point.
*
* Note the when first_time_in_loop, we only strip off the
* basename, and we don't care if that's different.
*/
if (!first_time_in_loop) {
char *old_sub_dir = strchr(old_dir, '\0')+1;
char *new_sub_dir = strchr(new_dir, '\0')+1;
if (!*new_dir) {
/*
* Special case when renaming to root directory,
* i.e. when new_dir == "". In this case, we had
* something like
* a/b/subdir => subdir
* and so dirname_munge() sets things up so that
* old_dir = "a/b\0subdir\0"
* new_dir = "\0ubdir\0"
* We didn't have a '/' to overwrite a '\0' onto
* in new_dir, so we have to compare differently.
*/
if (new_dir_first_char != old_sub_dir[0] ||
strcmp(old_sub_dir+1, new_sub_dir))
break;
} else {
if (strcmp(old_sub_dir, new_sub_dir))
break;
}
}
if (strset_contains(dirs_removed, old_dir))
increment_count(info, old_dir, new_dir);
else
break;
/* If we hit toplevel directory ("") for old or new dir, quit */
if (!*old_dir || !*new_dir)
break;
first_time_in_loop = 0;
}
/* Free resources we don't need anymore */
free(old_dir);
free(new_dir);
}
static void initialize_dir_rename_info(struct dir_rename_info *info,
struct strset *dirs_removed,
struct strmap *dir_rename_count)
{
struct hashmap_iter iter;
struct strmap_entry *entry;
int i;
if (!dirs_removed) {
info->setup = 0;
return;
}
info->setup = 1;
info->dir_rename_count = dir_rename_count;
if (!info->dir_rename_count) {
info->dir_rename_count = xmalloc(sizeof(*dir_rename_count));
strmap_init(info->dir_rename_count);
}
strintmap_init_with_options(&info->idx_map, -1, NULL, 0);
strmap_init_with_options(&info->dir_rename_guess, NULL, 0);
/* Setup info->relevant_source_dirs */
info->relevant_source_dirs = dirs_removed;
/*
* Loop setting up both info->idx_map, and doing setup of
* info->dir_rename_count.
*/
for (i = 0; i < rename_dst_nr; ++i) {
/*
* For non-renamed files, make idx_map contain mapping of
* filename -> index (index within rename_dst, that is)
*/
if (!rename_dst[i].is_rename) {
char *filename = rename_dst[i].p->two->path;
strintmap_set(&info->idx_map, filename, i);
continue;
}
/*
* For everything else (i.e. renamed files), make
* dir_rename_count contain a map of a map:
* old_directory -> {new_directory -> count}
* In other words, for every pair look at the directories for
* the old filename and the new filename and count how many
* times that pairing occurs.
*/
update_dir_rename_counts(info, dirs_removed,
rename_dst[i].p->one->path,
rename_dst[i].p->two->path);
}
/*
* Now we collapse
* dir_rename_count: old_directory -> {new_directory -> count}
* down to
* dir_rename_guess: old_directory -> best_new_directory
* where best_new_directory is the one with the highest count.
*/
strmap_for_each_entry(info->dir_rename_count, &iter, entry) {
/* entry->key is source_dir */
struct strintmap *counts = entry->value;
char *best_newdir;
best_newdir = xstrdup(get_highest_rename_path(counts));
strmap_put(&info->dir_rename_guess, entry->key,
best_newdir);
}
}
void partial_clear_dir_rename_count(struct strmap *dir_rename_count)
{
struct hashmap_iter iter;
struct strmap_entry *entry;
strmap_for_each_entry(dir_rename_count, &iter, entry) {
struct strintmap *counts = entry->value;
strintmap_clear(counts);
}
strmap_partial_clear(dir_rename_count, 1);
}
static void cleanup_dir_rename_info(struct dir_rename_info *info,
struct strset *dirs_removed,
int keep_dir_rename_count)
{
struct hashmap_iter iter;
struct strmap_entry *entry;
struct string_list to_remove = STRING_LIST_INIT_NODUP;
int i;
if (!info->setup)
return;
/* idx_map */
strintmap_clear(&info->idx_map);
/* dir_rename_guess */
strmap_clear(&info->dir_rename_guess, 1);
/* dir_rename_count */
if (!keep_dir_rename_count) {
partial_clear_dir_rename_count(info->dir_rename_count);
strmap_clear(info->dir_rename_count, 1);
FREE_AND_NULL(info->dir_rename_count);
return;
}
/*
* Although dir_rename_count was passed in
* diffcore_rename_extended() and we want to keep it around and
* return it to that caller, we first want to remove any data
* associated with directories that weren't renamed.
*/
strmap_for_each_entry(info->dir_rename_count, &iter, entry) {
const char *source_dir = entry->key;
struct strintmap *counts = entry->value;
if (!strset_contains(dirs_removed, source_dir)) {
string_list_append(&to_remove, source_dir);
strintmap_clear(counts);
continue;
}
}
for (i = 0; i < to_remove.nr; ++i)
strmap_remove(info->dir_rename_count,
to_remove.items[i].string, 1);
string_list_clear(&to_remove, 0);
}
static const char *get_basename(const char *filename)
{
/*
@ -379,8 +669,87 @@ static const char *get_basename(const char *filename)
return base ? base + 1 : filename;
}
static int idx_possible_rename(char *filename, struct dir_rename_info *info)
{
/*
* Our comparison of files with the same basename (see
* find_basename_matches() below), is only helpful when after exact
* rename detection we have exactly one file with a given basename
* among the rename sources and also only exactly one file with
* that basename among the rename destinations. When we have
* multiple files with the same basename in either set, we do not
* know which to compare against. However, there are some
* filenames that occur in large numbers (particularly
* build-related filenames such as 'Makefile', '.gitignore', or
* 'build.gradle' that potentially exist within every single
* subdirectory), and for performance we want to be able to quickly
* find renames for these files too.
*
* The reason basename comparisons are a useful heuristic was that it
* is common for people to move files across directories while keeping
* their filename the same. If we had a way of determining or even
* making a good educated guess about which directory these non-unique
* basename files had moved the file to, we could check it.
* Luckily...
*
* When an entire directory is in fact renamed, we have two factors
* helping us out:
* (a) the original directory disappeared giving us a hint
* about when we can apply an extra heuristic.
* (a) we often have several files within that directory and
* subdirectories that are renamed without changes
* So, rules for a heuristic:
* (0) If there basename matches are non-unique (the condition under
* which this function is called) AND
* (1) the directory in which the file was found has disappeared
* (i.e. dirs_removed is non-NULL and has a relevant entry) THEN
* (2) use exact renames of files within the directory to determine
* where the directory is likely to have been renamed to. IF
* there is at least one exact rename from within that
* directory, we can proceed.
* (3) If there are multiple places the directory could have been
* renamed to based on exact renames, ignore all but one of them.
* Just use the destination with the most renames going to it.
* (4) Check if applying that directory rename to the original file
* would result in a destination filename that is in the
* potential rename set. If so, return the index of the
* destination file (the index within rename_dst).
* (5) Compare the original file and returned destination for
* similarity, and if they are sufficiently similar, record the
* rename.
*
* This function, idx_possible_rename(), is only responsible for (4).
* The conditions/steps in (1)-(3) are handled via setting up
* dir_rename_count and dir_rename_guess in
* initialize_dir_rename_info(). Steps (0) and (5) are handled by
* the caller of this function.
*/
char *old_dir, *new_dir;
struct strbuf new_path = STRBUF_INIT;
int idx;
if (!info->setup)
return -1;
old_dir = get_dirname(filename);
new_dir = strmap_get(&info->dir_rename_guess, old_dir);
free(old_dir);
if (!new_dir)
return -1;
strbuf_addstr(&new_path, new_dir);
strbuf_addch(&new_path, '/');
strbuf_addstr(&new_path, get_basename(filename));
idx = strintmap_get(&info->idx_map, new_path.buf);
strbuf_release(&new_path);
return idx;
}
static int find_basename_matches(struct diff_options *options,
int minimum_score)
int minimum_score,
struct dir_rename_info *info,
struct strset *dirs_removed)
{
/*
* When I checked in early 2020, over 76% of file renames in linux
@ -415,8 +784,6 @@ static int find_basename_matches(struct diff_options *options,
int i, renames = 0;
struct strintmap sources;
struct strintmap dests;
struct hashmap_iter iter;
struct strmap_entry *entry;
/*
* The prefeteching stuff wants to know if it can skip prefetching
@ -466,17 +833,39 @@ static int find_basename_matches(struct diff_options *options,
}
/* Now look for basename matchups and do similarity estimation */
strintmap_for_each_entry(&sources, &iter, entry) {
const char *base = entry->key;
intptr_t src_index = (intptr_t)entry->value;
for (i = 0; i < rename_src_nr; ++i) {
char *filename = rename_src[i].p->one->path;
const char *base = NULL;
intptr_t src_index;
intptr_t dst_index;
if (src_index == -1)
continue;
if (0 <= (dst_index = strintmap_get(&dests, base))) {
/*
* If the basename is unique among remaining sources, then
* src_index will equal 'i' and we can attempt to match it
* to a unique basename in the destinations. Otherwise,
* use directory rename heuristics, if possible.
*/
base = get_basename(filename);
src_index = strintmap_get(&sources, base);
assert(src_index == -1 || src_index == i);
if (strintmap_contains(&dests, base)) {
struct diff_filespec *one, *two;
int score;
/* Find a matching destination, if possible */
dst_index = strintmap_get(&dests, base);
if (src_index == -1 || dst_index == -1) {
src_index = i;
dst_index = idx_possible_rename(filename, info);
}
if (dst_index == -1)
continue;
/* Ignore this dest if already used in a rename */
if (rename_dst[dst_index].is_rename)
continue; /* already used previously */
/* Estimate the similarity */
one = rename_src[src_index].p->one;
two = rename_dst[dst_index].p->two;
@ -488,6 +877,8 @@ static int find_basename_matches(struct diff_options *options,
continue;
record_rename_pair(dst_index, src_index, score);
renames++;
update_dir_rename_counts(info, dirs_removed,
one->path, two->path);
/*
* Found a rename so don't need text anymore; if we
@ -571,7 +962,12 @@ static int too_many_rename_candidates(int num_destinations, int num_sources,
return 1;
}
static int find_renames(struct diff_score *mx, int dst_cnt, int minimum_score, int copies)
static int find_renames(struct diff_score *mx,
int dst_cnt,
int minimum_score,
int copies,
struct dir_rename_info *info,
struct strset *dirs_removed)
{
int count = 0, i;
@ -588,6 +984,9 @@ static int find_renames(struct diff_score *mx, int dst_cnt, int minimum_score, i
continue;
record_rename_pair(mx[i].dst, mx[i].src, mx[i].score);
count++;
update_dir_rename_counts(info, dirs_removed,
rename_src[mx[i].src].p->one->path,
rename_dst[mx[i].dst].p->two->path);
}
return count;
}
@ -640,7 +1039,9 @@ static void remove_unneeded_paths_from_src(int detecting_copies)
rename_src_nr = new_num_src;
}
void diffcore_rename(struct diff_options *options)
void diffcore_rename_extended(struct diff_options *options,
struct strset *dirs_removed,
struct strmap *dir_rename_count)
{
int detect_rename = options->detect_rename;
int minimum_score = options->rename_score;
@ -651,9 +1052,14 @@ void diffcore_rename(struct diff_options *options)
int num_destinations, dst_cnt;
int num_sources, want_copies;
struct progress *progress = NULL;
struct dir_rename_info info;
trace2_region_enter("diff", "setup", options->repo);
info.setup = 0;
assert(!dir_rename_count || strmap_empty(dir_rename_count));
want_copies = (detect_rename == DIFF_DETECT_COPY);
if (dirs_removed && (break_idx || want_copies))
BUG("dirs_removed incompatible with break/copy detection");
if (!minimum_score)
minimum_score = DEFAULT_RENAME_SCORE;
@ -745,10 +1151,17 @@ void diffcore_rename(struct diff_options *options)
remove_unneeded_paths_from_src(want_copies);
trace2_region_leave("diff", "cull after exact", options->repo);
/* Preparation for basename-driven matching. */
trace2_region_enter("diff", "dir rename setup", options->repo);
initialize_dir_rename_info(&info,
dirs_removed, dir_rename_count);
trace2_region_leave("diff", "dir rename setup", options->repo);
/* Utilize file basenames to quickly find renames. */
trace2_region_enter("diff", "basename matches", options->repo);
rename_count += find_basename_matches(options,
min_basename_score);
min_basename_score,
&info, dirs_removed);
trace2_region_leave("diff", "basename matches", options->repo);
/*
@ -833,9 +1246,11 @@ void diffcore_rename(struct diff_options *options)
/* cost matrix sorted by most to least similar pair */
STABLE_QSORT(mx, dst_cnt * NUM_CANDIDATE_PER_DST, score_compare);
rename_count += find_renames(mx, dst_cnt, minimum_score, 0);
rename_count += find_renames(mx, dst_cnt, minimum_score, 0,
&info, dirs_removed);
if (want_copies)
rename_count += find_renames(mx, dst_cnt, minimum_score, 1);
rename_count += find_renames(mx, dst_cnt, minimum_score, 1,
&info, dirs_removed);
free(mx);
trace2_region_leave("diff", "inexact renames", options->repo);
@ -911,6 +1326,7 @@ void diffcore_rename(struct diff_options *options)
if (rename_dst[i].filespec_to_free)
free_filespec(rename_dst[i].filespec_to_free);
cleanup_dir_rename_info(&info, dirs_removed, dir_rename_count != NULL);
FREE_AND_NULL(rename_dst);
rename_dst_nr = rename_dst_alloc = 0;
FREE_AND_NULL(rename_src);
@ -922,3 +1338,8 @@ void diffcore_rename(struct diff_options *options)
trace2_region_leave("diff", "write back to queue", options->repo);
return;
}
void diffcore_rename(struct diff_options *options)
{
diffcore_rename_extended(options, NULL, NULL);
}

Просмотреть файл

@ -8,6 +8,8 @@
struct diff_options;
struct repository;
struct strmap;
struct strset;
struct userdiff_driver;
/* This header file is internal between diff.c and its diff transformers
@ -159,8 +161,13 @@ struct diff_filepair *diff_queue(struct diff_queue_struct *,
struct diff_filespec *);
void diff_q(struct diff_queue_struct *, struct diff_filepair *);
void partial_clear_dir_rename_count(struct strmap *dir_rename_count);
void diffcore_break(struct repository *, int);
void diffcore_rename(struct diff_options *);
void diffcore_rename_extended(struct diff_options *options,
struct strset *dirs_removed,
struct strmap *dir_rename_count);
void diffcore_merge_broken(void);
void diffcore_pickaxe(struct diff_options *);
void diffcore_order(const char *orderfile);

Просмотреть файл

@ -351,17 +351,11 @@ static void clear_or_reinit_internal_opts(struct merge_options_internal *opti,
/* Free memory used by various renames maps */
for (i = MERGE_SIDE1; i <= MERGE_SIDE2; ++i) {
struct hashmap_iter iter;
struct strmap_entry *entry;
strset_func(&renames->dirs_removed[i]);
strmap_for_each_entry(&renames->dir_rename_count[i],
&iter, entry) {
struct strintmap *counts = entry->value;
strintmap_clear(counts);
}
strmap_func(&renames->dir_rename_count[i], 1);
partial_clear_dir_rename_count(&renames->dir_rename_count[i]);
if (!reinitialize)
strmap_clear(&renames->dir_rename_count[i], 1);
strmap_func(&renames->dir_renames[i], 0);
}
@ -1302,131 +1296,6 @@ static char *handle_path_level_conflicts(struct merge_options *opt,
return new_path;
}
static void dirname_munge(char *filename)
{
char *slash = strrchr(filename, '/');
if (!slash)
slash = filename;
*slash = '\0';
}
static void increment_count(struct strmap *dir_rename_count,
char *old_dir,
char *new_dir)
{
struct strintmap *counts;
struct strmap_entry *e;
/* Get the {new_dirs -> counts} mapping using old_dir */
e = strmap_get_entry(dir_rename_count, old_dir);
if (e) {
counts = e->value;
} else {
counts = xmalloc(sizeof(*counts));
strintmap_init_with_options(counts, 0, NULL, 1);
strmap_put(dir_rename_count, old_dir, counts);
}
/* Increment the count for new_dir */
strintmap_incr(counts, new_dir, 1);
}
static void update_dir_rename_counts(struct strmap *dir_rename_count,
struct strset *dirs_removed,
const char *oldname,
const char *newname)
{
char *old_dir = xstrdup(oldname);
char *new_dir = xstrdup(newname);
char new_dir_first_char = new_dir[0];
int first_time_in_loop = 1;
while (1) {
dirname_munge(old_dir);
dirname_munge(new_dir);
/*
* When renaming
* "a/b/c/d/e/foo.c" -> "a/b/some/thing/else/e/foo.c"
* then this suggests that both
* a/b/c/d/e/ => a/b/some/thing/else/e/
* a/b/c/d/ => a/b/some/thing/else/
* so we want to increment counters for both. We do NOT,
* however, also want to suggest that there was the following
* rename:
* a/b/c/ => a/b/some/thing/
* so we need to quit at that point.
*
* Note the when first_time_in_loop, we only strip off the
* basename, and we don't care if that's different.
*/
if (!first_time_in_loop) {
char *old_sub_dir = strchr(old_dir, '\0')+1;
char *new_sub_dir = strchr(new_dir, '\0')+1;
if (!*new_dir) {
/*
* Special case when renaming to root directory,
* i.e. when new_dir == "". In this case, we had
* something like
* a/b/subdir => subdir
* and so dirname_munge() sets things up so that
* old_dir = "a/b\0subdir\0"
* new_dir = "\0ubdir\0"
* We didn't have a '/' to overwrite a '\0' onto
* in new_dir, so we have to compare differently.
*/
if (new_dir_first_char != old_sub_dir[0] ||
strcmp(old_sub_dir+1, new_sub_dir))
break;
} else {
if (strcmp(old_sub_dir, new_sub_dir))
break;
}
}
if (strset_contains(dirs_removed, old_dir))
increment_count(dir_rename_count, old_dir, new_dir);
else
break;
/* If we hit toplevel directory ("") for old or new dir, quit */
if (!*old_dir || !*new_dir)
break;
first_time_in_loop = 0;
}
/* Free resources we don't need anymore */
free(old_dir);
free(new_dir);
}
static void compute_rename_counts(struct diff_queue_struct *pairs,
struct strmap *dir_rename_count,
struct strset *dirs_removed)
{
int i;
for (i = 0; i < pairs->nr; ++i) {
struct diff_filepair *pair = pairs->queue[i];
/* File not part of directory rename if it wasn't renamed */
if (pair->status != 'R')
continue;
/*
* Make dir_rename_count contain a map of a map:
* old_directory -> {new_directory -> count}
* In other words, for every pair look at the directories for
* the old filename and the new filename and count how many
* times that pairing occurs.
*/
update_dir_rename_counts(dir_rename_count, dirs_removed,
pair->one->path,
pair->two->path);
}
}
static void get_provisional_directory_renames(struct merge_options *opt,
unsigned side,
int *clean)
@ -1435,9 +1304,6 @@ static void get_provisional_directory_renames(struct merge_options *opt,
struct strmap_entry *entry;
struct rename_info *renames = &opt->priv->renames;
compute_rename_counts(&renames->pairs[side],
&renames->dir_rename_count[side],
&renames->dirs_removed[side]);
/*
* Collapse
* dir_rename_count: old_directory -> {new_directory -> count}
@ -2161,7 +2027,9 @@ static void detect_regular_renames(struct merge_options *opt,
diff_queued_diff = renames->pairs[side_index];
trace2_region_enter("diff", "diffcore_rename", opt->repo);
diffcore_rename(&diff_opts);
diffcore_rename_extended(&diff_opts,
&renames->dirs_removed[side_index],
&renames->dir_rename_count[side_index]);
trace2_region_leave("diff", "diffcore_rename", opt->repo);
resolve_diffpair_statuses(&diff_queued_diff);