From 2b21008d3c43d41588e84b45907bb8c2e86278f6 Mon Sep 17 00:00:00 2001 From: Jeff King Date: Mon, 13 Feb 2012 17:28:10 -0500 Subject: [PATCH 1/5] diff-highlight: make perl strict and warnings fatal These perl features can catch bugs, and we shouldn't be violating any of the strict rules or creating any warnings, so let's turn them on. Signed-off-by: Jeff King Signed-off-by: Junio C Hamano --- contrib/diff-highlight/diff-highlight | 3 +++ 1 file changed, 3 insertions(+) diff --git a/contrib/diff-highlight/diff-highlight b/contrib/diff-highlight/diff-highlight index d8938982e4..c3302dd817 100755 --- a/contrib/diff-highlight/diff-highlight +++ b/contrib/diff-highlight/diff-highlight @@ -1,5 +1,8 @@ #!/usr/bin/perl +use warnings FATAL => 'all'; +use strict; + # Highlight by reversing foreground and background. You could do # other things like bold or underline if you prefer. my $HIGHLIGHT = "\x1b[7m"; From 097128d1bce7194702f336e30c5e228aa8ba774f Mon Sep 17 00:00:00 2001 From: Jeff King Date: Mon, 13 Feb 2012 17:32:47 -0500 Subject: [PATCH 2/5] diff-highlight: don't highlight whole lines MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If you have a change like: -foo +bar we end up highlighting the entirety of both lines (since the whole thing is changed). But the point of diff highlighting is to pinpoint the specific change in a pair of lines that are mostly identical. In this case, the highlighting is just noise, since there is nothing to pinpoint, and we are better off doing nothing. The implementation looks for "interesting" pairs by checking to see whether they actually have a matching prefix or suffix that does not simply consist of colorization and whitespace. However, the implementation makes it easy to plug in other heuristics, too, like: 1. Depending on the source material, the set of "boring" characters could be tweaked to include language-specific stuff (like braces or semicolons for C). 2. Instead of saying "an interesting line has at least one character of prefix or suffix", we could require that less than N percent of the line be highlighted. The simple "ignore whitespace, and highlight if there are any matched characters" implemented by this patch seems to give good results on git.git. I'll leave experimentation with other heuristics to somebody who has a dataset that does not look good with the current code. Based on an original idea and implementation by Michał Kiedrowicz. Signed-off-by: Jeff King Signed-off-by: Junio C Hamano --- contrib/diff-highlight/diff-highlight | 28 +++++++++++++++++++++++++-- 1 file changed, 26 insertions(+), 2 deletions(-) diff --git a/contrib/diff-highlight/diff-highlight b/contrib/diff-highlight/diff-highlight index c3302dd817..0d8df84a20 100755 --- a/contrib/diff-highlight/diff-highlight +++ b/contrib/diff-highlight/diff-highlight @@ -8,6 +8,7 @@ use strict; my $HIGHLIGHT = "\x1b[7m"; my $UNHIGHLIGHT = "\x1b[27m"; my $COLOR = qr/\x1b\[[0-9;]*m/; +my $BORING = qr/$COLOR|\s/; my @window; @@ -104,8 +105,14 @@ sub show_pair { } } - print highlight(\@a, $pa, $sa); - print highlight(\@b, $pb, $sb); + if (is_pair_interesting(\@a, $pa, $sa, \@b, $pb, $sb)) { + print highlight(\@a, $pa, $sa); + print highlight(\@b, $pb, $sb); + } + else { + print join('', @a); + print join('', @b); + } } sub split_line { @@ -125,3 +132,20 @@ sub highlight { @{$line}[($suffix+1)..$#$line] ); } + +# Pairs are interesting to highlight only if we are going to end up +# highlighting a subset (i.e., not the whole line). Otherwise, the highlighting +# is just useless noise. We can detect this by finding either a matching prefix +# or suffix (disregarding boring bits like whitespace and colorization). +sub is_pair_interesting { + my ($a, $pa, $sa, $b, $pb, $sb) = @_; + my $prefix_a = join('', @$a[0..($pa-1)]); + my $prefix_b = join('', @$b[0..($pb-1)]); + my $suffix_a = join('', @$a[($sa+1)..$#$a]); + my $suffix_b = join('', @$b[($sb+1)..$#$b]); + + return $prefix_a !~ /^$COLOR*-$BORING*$/ || + $prefix_b !~ /^$COLOR*\+$BORING*$/ || + $suffix_a !~ /^$BORING*$/ || + $suffix_b !~ /^$BORING*$/; +} From 6463fd7ed189f4100e1102062f29b969384f1436 Mon Sep 17 00:00:00 2001 From: Jeff King Date: Mon, 13 Feb 2012 17:33:10 -0500 Subject: [PATCH 3/5] diff-highlight: refactor to prepare for multi-line hunks The current code structure assumes that we will only look at a pair of lines at any given time, and that the end result should always be to output that pair. However, we want to eventually handle multi-line hunks, which will involve collating pairs of removed/added lines. Let's refactor the code to return highlighted pairs instead of printing them. Signed-off-by: Jeff King Signed-off-by: Junio C Hamano --- contrib/diff-highlight/diff-highlight | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/contrib/diff-highlight/diff-highlight b/contrib/diff-highlight/diff-highlight index 0d8df84a20..279d21181e 100755 --- a/contrib/diff-highlight/diff-highlight +++ b/contrib/diff-highlight/diff-highlight @@ -23,7 +23,7 @@ while (<>) { $window[2] =~ /^$COLOR*\+/ && $window[3] !~ /^$COLOR*\+/) { print shift @window; - show_pair(shift @window, shift @window); + show_hunk(shift @window, shift @window); } else { print shift @window; @@ -48,7 +48,7 @@ if (@window == 3 && $window[1] =~ /^$COLOR*-/ && $window[2] =~ /^$COLOR*\+/) { print shift @window; - show_pair(shift @window, shift @window); + show_hunk(shift @window, shift @window); } # And then flush any remaining lines. @@ -58,7 +58,13 @@ while (@window) { exit 0; -sub show_pair { +sub show_hunk { + my ($a, $b) = @_; + + print highlight_pair($a, $b); +} + +sub highlight_pair { my @a = split_line(shift); my @b = split_line(shift); @@ -106,12 +112,12 @@ sub show_pair { } if (is_pair_interesting(\@a, $pa, $sa, \@b, $pb, $sb)) { - print highlight(\@a, $pa, $sa); - print highlight(\@b, $pb, $sb); + return highlight_line(\@a, $pa, $sa), + highlight_line(\@b, $pb, $sb); } else { - print join('', @a); - print join('', @b); + return join('', @a), + join('', @b); } } @@ -121,7 +127,7 @@ sub split_line { split /($COLOR*)/; } -sub highlight { +sub highlight_line { my ($line, $prefix, $suffix) = @_; return join('', From 34d9819e0a387be6d49cffe67458036450d6d0d5 Mon Sep 17 00:00:00 2001 From: Jeff King Date: Mon, 13 Feb 2012 17:36:36 -0500 Subject: [PATCH 4/5] diff-highlight: match multi-line hunks MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently we only bother highlighting single-line hunks. The rationale was that the purpose of highlighting is to point out small changes between two similar lines that are otherwise hard to see. However, that meant we missed similar cases where two lines were changed together, like: -foo(buf); -bar(buf); +foo(obj->buf); +bar(obj->buf); Each of those changes is simple, and would benefit from highlighting (the "obj->" parts in this case). This patch considers whole hunks at a time. For now, we consider only the case where the hunk has the same number of removed and added lines, and assume that the lines from each segment correspond one-to-one. While this is just a heuristic, in practice it seems to generate sensible results (especially because we now omit highlighting on completely-changed lines, so when our heuristic is wrong, we tend to avoid highlighting at all). Based on an original idea and implementation by Michał Kiedrowicz. Signed-off-by: Jeff King Signed-off-by: Junio C Hamano --- contrib/diff-highlight/README | 16 +++--- contrib/diff-highlight/diff-highlight | 70 ++++++++++++++++----------- 2 files changed, 52 insertions(+), 34 deletions(-) diff --git a/contrib/diff-highlight/README b/contrib/diff-highlight/README index 1b7b6df8eb..4a58579779 100644 --- a/contrib/diff-highlight/README +++ b/contrib/diff-highlight/README @@ -14,13 +14,15 @@ Instead, this script post-processes the line-oriented diff, finds pairs of lines, and highlights the differing segments. It's currently very simple and stupid about doing these tasks. In particular: - 1. It will only highlight a pair of lines if they are the only two - lines in a hunk. It could instead try to match up "before" and - "after" lines for a given hunk into pairs of similar lines. - However, this may end up visually distracting, as the paired - lines would have other highlighted lines in between them. And in - practice, the lines which most need attention called to their - small, hard-to-see changes are touching only a single line. + 1. It will only highlight hunks in which the number of removed and + added lines is the same, and it will pair lines within the hunk by + position (so the first removed line is compared to the first added + line, and so forth). This is simple and tends to work well in + practice. More complex changes don't highlight well, so we tend to + exclude them due to the "same number of removed and added lines" + restriction. Or even if we do try to highlight them, they end up + not highlighting because of our "don't highlight if the whole line + would be highlighted" rule. 2. It will find the common prefix and suffix of two lines, and consider everything in the middle to be "different". It could diff --git a/contrib/diff-highlight/diff-highlight b/contrib/diff-highlight/diff-highlight index 279d21181e..c4404d49c9 100755 --- a/contrib/diff-highlight/diff-highlight +++ b/contrib/diff-highlight/diff-highlight @@ -10,23 +10,28 @@ my $UNHIGHLIGHT = "\x1b[27m"; my $COLOR = qr/\x1b\[[0-9;]*m/; my $BORING = qr/$COLOR|\s/; -my @window; +my @removed; +my @added; +my $in_hunk; while (<>) { - # We highlight only single-line changes, so we need - # a 4-line window to make a decision on whether - # to highlight. - push @window, $_; - next if @window < 4; - if ($window[0] =~ /^$COLOR*(\@| )/ && - $window[1] =~ /^$COLOR*-/ && - $window[2] =~ /^$COLOR*\+/ && - $window[3] !~ /^$COLOR*\+/) { - print shift @window; - show_hunk(shift @window, shift @window); + if (!$in_hunk) { + print; + $in_hunk = /^$COLOR*\@/; + } + elsif (/^$COLOR*-/) { + push @removed, $_; + } + elsif (/^$COLOR*\+/) { + push @added, $_; } else { - print shift @window; + show_hunk(\@removed, \@added); + @removed = (); + @added = (); + + print; + $in_hunk = /^$COLOR*[\@ ]/; } # Most of the time there is enough output to keep things streaming, @@ -42,26 +47,37 @@ while (<>) { } } -# Special case a single-line hunk at the end of file. -if (@window == 3 && - $window[0] =~ /^$COLOR*(\@| )/ && - $window[1] =~ /^$COLOR*-/ && - $window[2] =~ /^$COLOR*\+/) { - print shift @window; - show_hunk(shift @window, shift @window); -} - -# And then flush any remaining lines. -while (@window) { - print shift @window; -} +# Flush any queued hunk (this can happen when there is no trailing context in +# the final diff of the input). +show_hunk(\@removed, \@added); exit 0; sub show_hunk { my ($a, $b) = @_; - print highlight_pair($a, $b); + # If one side is empty, then there is nothing to compare or highlight. + if (!@$a || !@$b) { + print @$a, @$b; + return; + } + + # If we have mismatched numbers of lines on each side, we could try to + # be clever and match up similar lines. But for now we are simple and + # stupid, and only handle multi-line hunks that remove and add the same + # number of lines. + if (@$a != @$b) { + print @$a, @$b; + return; + } + + my @queue; + for (my $i = 0; $i < @$a; $i++) { + my ($rm, $add) = highlight_pair($a->[$i], $b->[$i]); + print $rm; + push @queue, $add; + } + print @queue; } sub highlight_pair { From a0b676aaee29446388cd57fc555a740f9d26eea3 Mon Sep 17 00:00:00 2001 From: Jeff King Date: Mon, 13 Feb 2012 17:37:33 -0500 Subject: [PATCH 5/5] diff-highlight: document some non-optimal cases The diff-highlight script works on heuristics, so it can be wrong. Let's document some of the wrong-ness in case somebody feels like working on it. Signed-off-by: Jeff King Signed-off-by: Junio C Hamano --- contrib/diff-highlight/README | 93 +++++++++++++++++++++++++++++++++++ 1 file changed, 93 insertions(+) diff --git a/contrib/diff-highlight/README b/contrib/diff-highlight/README index 4a58579779..502e03b305 100644 --- a/contrib/diff-highlight/README +++ b/contrib/diff-highlight/README @@ -57,3 +57,96 @@ following in your git configuration: show = diff-highlight | less diff = diff-highlight | less --------------------------------------------- + +Bugs +---- + +Because diff-highlight relies on heuristics to guess which parts of +changes are important, there are some cases where the highlighting is +more distracting than useful. Fortunately, these cases are rare in +practice, and when they do occur, the worst case is simply a little +extra highlighting. This section documents some cases known to be +sub-optimal, in case somebody feels like working on improving the +heuristics. + +1. Two changes on the same line get highlighted in a blob. For example, + highlighting: + +---------------------------------------------- +-foo(buf, size); ++foo(obj->buf, obj->size); +---------------------------------------------- + + yields (where the inside of "+{}" would be highlighted): + +---------------------------------------------- +-foo(buf, size); ++foo(+{obj->buf, obj->}size); +---------------------------------------------- + + whereas a more semantically meaningful output would be: + +---------------------------------------------- +-foo(buf, size); ++foo(+{obj->}buf, +{obj->}size); +---------------------------------------------- + + Note that doing this right would probably involve a set of + content-specific boundary patterns, similar to word-diff. Otherwise + you get junk like: + +----------------------------------------------------- +-this line has some -{i}nt-{ere}sti-{ng} text on it ++this line has some +{fa}nt+{a}sti+{c} text on it +----------------------------------------------------- + + which is less readable than the current output. + +2. The multi-line matching assumes that lines in the pre- and post-image + match by position. This is often the case, but can be fooled when a + line is removed from the top and a new one added at the bottom (or + vice versa). Unless the lines in the middle are also changed, diffs + will show this as two hunks, and it will not get highlighted at all + (which is good). But if the lines in the middle are changed, the + highlighting can be misleading. Here's a pathological case: + +----------------------------------------------------- +-one +-two +-three +-four ++two 2 ++three 3 ++four 4 ++five 5 +----------------------------------------------------- + + which gets highlighted as: + +----------------------------------------------------- +-one +-t-{wo} +-three +-f-{our} ++two 2 ++t+{hree 3} ++four 4 ++f+{ive 5} +----------------------------------------------------- + + because it matches "two" to "three 3", and so forth. It would be + nicer as: + +----------------------------------------------------- +-one +-two +-three +-four ++two +{2} ++three +{3} ++four +{4} ++five 5 +----------------------------------------------------- + + which would probably involve pre-matching the lines into pairs + according to some heuristic.