From 2b21008d3c43d41588e84b45907bb8c2e86278f6 Mon Sep 17 00:00:00 2001
From: Jeff King <peff@peff.net>
Date: Mon, 13 Feb 2012 17:28:10 -0500
Subject: [PATCH 1/5] diff-highlight: make perl strict and warnings fatal

These perl features can catch bugs, and we shouldn't be
violating any of the strict rules or creating any warnings,
so let's turn them on.

Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
---
 contrib/diff-highlight/diff-highlight | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/contrib/diff-highlight/diff-highlight b/contrib/diff-highlight/diff-highlight
index d8938982e4..c3302dd817 100755
--- a/contrib/diff-highlight/diff-highlight
+++ b/contrib/diff-highlight/diff-highlight
@@ -1,5 +1,8 @@
 #!/usr/bin/perl
 
+use warnings FATAL => 'all';
+use strict;
+
 # Highlight by reversing foreground and background. You could do
 # other things like bold or underline if you prefer.
 my $HIGHLIGHT   = "\x1b[7m";

From 097128d1bce7194702f336e30c5e228aa8ba774f Mon Sep 17 00:00:00 2001
From: Jeff King <peff@peff.net>
Date: Mon, 13 Feb 2012 17:32:47 -0500
Subject: [PATCH 2/5] diff-highlight: don't highlight whole lines
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

If you have a change like:

  -foo
  +bar

we end up highlighting the entirety of both lines (since the
whole thing is changed). But the point of diff highlighting
is to pinpoint the specific change in a pair of lines that
are mostly identical. In this case, the highlighting is just
noise, since there is nothing to pinpoint, and we are better
off doing nothing.

The implementation looks for "interesting" pairs by checking
to see whether they actually have a matching prefix or
suffix that does not simply consist of colorization and
whitespace.  However, the implementation makes it easy to
plug in other heuristics, too, like:

  1. Depending on the source material, the set of "boring"
     characters could be tweaked to include language-specific
     stuff (like braces or semicolons for C).

  2. Instead of saying "an interesting line has at least one
     character of prefix or suffix", we could require that
     less than N percent of the line be highlighted.

The simple "ignore whitespace, and highlight if there are
any matched characters" implemented by this patch seems to
give good results on git.git. I'll leave experimentation
with other heuristics to somebody who has a dataset that
does not look good with the current code.

Based on an original idea and implementation by Michał
Kiedrowicz.

Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
---
 contrib/diff-highlight/diff-highlight | 28 +++++++++++++++++++++++++--
 1 file changed, 26 insertions(+), 2 deletions(-)

diff --git a/contrib/diff-highlight/diff-highlight b/contrib/diff-highlight/diff-highlight
index c3302dd817..0d8df84a20 100755
--- a/contrib/diff-highlight/diff-highlight
+++ b/contrib/diff-highlight/diff-highlight
@@ -8,6 +8,7 @@ use strict;
 my $HIGHLIGHT   = "\x1b[7m";
 my $UNHIGHLIGHT = "\x1b[27m";
 my $COLOR = qr/\x1b\[[0-9;]*m/;
+my $BORING = qr/$COLOR|\s/;
 
 my @window;
 
@@ -104,8 +105,14 @@ sub show_pair {
 		}
 	}
 
-	print highlight(\@a, $pa, $sa);
-	print highlight(\@b, $pb, $sb);
+	if (is_pair_interesting(\@a, $pa, $sa, \@b, $pb, $sb)) {
+		print highlight(\@a, $pa, $sa);
+		print highlight(\@b, $pb, $sb);
+	}
+	else {
+		print join('', @a);
+		print join('', @b);
+	}
 }
 
 sub split_line {
@@ -125,3 +132,20 @@ sub highlight {
 		@{$line}[($suffix+1)..$#$line]
 	);
 }
+
+# Pairs are interesting to highlight only if we are going to end up
+# highlighting a subset (i.e., not the whole line). Otherwise, the highlighting
+# is just useless noise. We can detect this by finding either a matching prefix
+# or suffix (disregarding boring bits like whitespace and colorization).
+sub is_pair_interesting {
+	my ($a, $pa, $sa, $b, $pb, $sb) = @_;
+	my $prefix_a = join('', @$a[0..($pa-1)]);
+	my $prefix_b = join('', @$b[0..($pb-1)]);
+	my $suffix_a = join('', @$a[($sa+1)..$#$a]);
+	my $suffix_b = join('', @$b[($sb+1)..$#$b]);
+
+	return $prefix_a !~ /^$COLOR*-$BORING*$/ ||
+	       $prefix_b !~ /^$COLOR*\+$BORING*$/ ||
+	       $suffix_a !~ /^$BORING*$/ ||
+	       $suffix_b !~ /^$BORING*$/;
+}

From 6463fd7ed189f4100e1102062f29b969384f1436 Mon Sep 17 00:00:00 2001
From: Jeff King <peff@peff.net>
Date: Mon, 13 Feb 2012 17:33:10 -0500
Subject: [PATCH 3/5] diff-highlight: refactor to prepare for multi-line hunks

The current code structure assumes that we will only look at
a pair of lines at any given time, and that the end result
should always be to output that pair. However, we want to
eventually handle multi-line hunks, which will involve
collating pairs of removed/added lines. Let's refactor the
code to return highlighted pairs instead of printing them.

Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
---
 contrib/diff-highlight/diff-highlight | 22 ++++++++++++++--------
 1 file changed, 14 insertions(+), 8 deletions(-)

diff --git a/contrib/diff-highlight/diff-highlight b/contrib/diff-highlight/diff-highlight
index 0d8df84a20..279d21181e 100755
--- a/contrib/diff-highlight/diff-highlight
+++ b/contrib/diff-highlight/diff-highlight
@@ -23,7 +23,7 @@ while (<>) {
 	    $window[2] =~ /^$COLOR*\+/ &&
 	    $window[3] !~ /^$COLOR*\+/) {
 		print shift @window;
-		show_pair(shift @window, shift @window);
+		show_hunk(shift @window, shift @window);
 	}
 	else {
 		print shift @window;
@@ -48,7 +48,7 @@ if (@window == 3 &&
     $window[1] =~ /^$COLOR*-/ &&
     $window[2] =~ /^$COLOR*\+/) {
 	print shift @window;
-	show_pair(shift @window, shift @window);
+	show_hunk(shift @window, shift @window);
 }
 
 # And then flush any remaining lines.
@@ -58,7 +58,13 @@ while (@window) {
 
 exit 0;
 
-sub show_pair {
+sub show_hunk {
+	my ($a, $b) = @_;
+
+	print highlight_pair($a, $b);
+}
+
+sub highlight_pair {
 	my @a = split_line(shift);
 	my @b = split_line(shift);
 
@@ -106,12 +112,12 @@ sub show_pair {
 	}
 
 	if (is_pair_interesting(\@a, $pa, $sa, \@b, $pb, $sb)) {
-		print highlight(\@a, $pa, $sa);
-		print highlight(\@b, $pb, $sb);
+		return highlight_line(\@a, $pa, $sa),
+		       highlight_line(\@b, $pb, $sb);
 	}
 	else {
-		print join('', @a);
-		print join('', @b);
+		return join('', @a),
+		       join('', @b);
 	}
 }
 
@@ -121,7 +127,7 @@ sub split_line {
 	       split /($COLOR*)/;
 }
 
-sub highlight {
+sub highlight_line {
 	my ($line, $prefix, $suffix) = @_;
 
 	return join('',

From 34d9819e0a387be6d49cffe67458036450d6d0d5 Mon Sep 17 00:00:00 2001
From: Jeff King <peff@peff.net>
Date: Mon, 13 Feb 2012 17:36:36 -0500
Subject: [PATCH 4/5] diff-highlight: match multi-line hunks
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Currently we only bother highlighting single-line hunks. The
rationale was that the purpose of highlighting is to point
out small changes between two similar lines that are
otherwise hard to see. However, that meant we missed similar
cases where two lines were changed together, like:

   -foo(buf);
   -bar(buf);
   +foo(obj->buf);
   +bar(obj->buf);

Each of those changes is simple, and would benefit from
highlighting (the "obj->" parts in this case).

This patch considers whole hunks at a time. For now, we
consider only the case where the hunk has the same number of
removed and added lines, and assume that the lines from each
segment correspond one-to-one. While this is just a
heuristic, in practice it seems to generate sensible
results (especially because we now omit highlighting on
completely-changed lines, so when our heuristic is wrong, we
tend to avoid highlighting at all).

Based on an original idea and implementation by Michał
Kiedrowicz.

Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
---
 contrib/diff-highlight/README         | 16 +++---
 contrib/diff-highlight/diff-highlight | 70 ++++++++++++++++-----------
 2 files changed, 52 insertions(+), 34 deletions(-)

diff --git a/contrib/diff-highlight/README b/contrib/diff-highlight/README
index 1b7b6df8eb..4a58579779 100644
--- a/contrib/diff-highlight/README
+++ b/contrib/diff-highlight/README
@@ -14,13 +14,15 @@ Instead, this script post-processes the line-oriented diff, finds pairs
 of lines, and highlights the differing segments.  It's currently very
 simple and stupid about doing these tasks. In particular:
 
-  1. It will only highlight a pair of lines if they are the only two
-     lines in a hunk.  It could instead try to match up "before" and
-     "after" lines for a given hunk into pairs of similar lines.
-     However, this may end up visually distracting, as the paired
-     lines would have other highlighted lines in between them. And in
-     practice, the lines which most need attention called to their
-     small, hard-to-see changes are touching only a single line.
+  1. It will only highlight hunks in which the number of removed and
+     added lines is the same, and it will pair lines within the hunk by
+     position (so the first removed line is compared to the first added
+     line, and so forth). This is simple and tends to work well in
+     practice. More complex changes don't highlight well, so we tend to
+     exclude them due to the "same number of removed and added lines"
+     restriction. Or even if we do try to highlight them, they end up
+     not highlighting because of our "don't highlight if the whole line
+     would be highlighted" rule.
 
   2. It will find the common prefix and suffix of two lines, and
      consider everything in the middle to be "different". It could
diff --git a/contrib/diff-highlight/diff-highlight b/contrib/diff-highlight/diff-highlight
index 279d21181e..c4404d49c9 100755
--- a/contrib/diff-highlight/diff-highlight
+++ b/contrib/diff-highlight/diff-highlight
@@ -10,23 +10,28 @@ my $UNHIGHLIGHT = "\x1b[27m";
 my $COLOR = qr/\x1b\[[0-9;]*m/;
 my $BORING = qr/$COLOR|\s/;
 
-my @window;
+my @removed;
+my @added;
+my $in_hunk;
 
 while (<>) {
-	# We highlight only single-line changes, so we need
-	# a 4-line window to make a decision on whether
-	# to highlight.
-	push @window, $_;
-	next if @window < 4;
-	if ($window[0] =~ /^$COLOR*(\@| )/ &&
-	    $window[1] =~ /^$COLOR*-/ &&
-	    $window[2] =~ /^$COLOR*\+/ &&
-	    $window[3] !~ /^$COLOR*\+/) {
-		print shift @window;
-		show_hunk(shift @window, shift @window);
+	if (!$in_hunk) {
+		print;
+		$in_hunk = /^$COLOR*\@/;
+	}
+	elsif (/^$COLOR*-/) {
+		push @removed, $_;
+	}
+	elsif (/^$COLOR*\+/) {
+		push @added, $_;
 	}
 	else {
-		print shift @window;
+		show_hunk(\@removed, \@added);
+		@removed = ();
+		@added = ();
+
+		print;
+		$in_hunk = /^$COLOR*[\@ ]/;
 	}
 
 	# Most of the time there is enough output to keep things streaming,
@@ -42,26 +47,37 @@ while (<>) {
 	}
 }
 
-# Special case a single-line hunk at the end of file.
-if (@window == 3 &&
-    $window[0] =~ /^$COLOR*(\@| )/ &&
-    $window[1] =~ /^$COLOR*-/ &&
-    $window[2] =~ /^$COLOR*\+/) {
-	print shift @window;
-	show_hunk(shift @window, shift @window);
-}
-
-# And then flush any remaining lines.
-while (@window) {
-	print shift @window;
-}
+# Flush any queued hunk (this can happen when there is no trailing context in
+# the final diff of the input).
+show_hunk(\@removed, \@added);
 
 exit 0;
 
 sub show_hunk {
 	my ($a, $b) = @_;
 
-	print highlight_pair($a, $b);
+	# If one side is empty, then there is nothing to compare or highlight.
+	if (!@$a || !@$b) {
+		print @$a, @$b;
+		return;
+	}
+
+	# If we have mismatched numbers of lines on each side, we could try to
+	# be clever and match up similar lines. But for now we are simple and
+	# stupid, and only handle multi-line hunks that remove and add the same
+	# number of lines.
+	if (@$a != @$b) {
+		print @$a, @$b;
+		return;
+	}
+
+	my @queue;
+	for (my $i = 0; $i < @$a; $i++) {
+		my ($rm, $add) = highlight_pair($a->[$i], $b->[$i]);
+		print $rm;
+		push @queue, $add;
+	}
+	print @queue;
 }
 
 sub highlight_pair {

From a0b676aaee29446388cd57fc555a740f9d26eea3 Mon Sep 17 00:00:00 2001
From: Jeff King <peff@peff.net>
Date: Mon, 13 Feb 2012 17:37:33 -0500
Subject: [PATCH 5/5] diff-highlight: document some non-optimal cases

The diff-highlight script works on heuristics, so it can be
wrong. Let's document some of the wrong-ness in case
somebody feels like working on it.

Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
---
 contrib/diff-highlight/README | 93 +++++++++++++++++++++++++++++++++++
 1 file changed, 93 insertions(+)

diff --git a/contrib/diff-highlight/README b/contrib/diff-highlight/README
index 4a58579779..502e03b305 100644
--- a/contrib/diff-highlight/README
+++ b/contrib/diff-highlight/README
@@ -57,3 +57,96 @@ following in your git configuration:
 	show = diff-highlight | less
 	diff = diff-highlight | less
 ---------------------------------------------
+
+Bugs
+----
+
+Because diff-highlight relies on heuristics to guess which parts of
+changes are important, there are some cases where the highlighting is
+more distracting than useful. Fortunately, these cases are rare in
+practice, and when they do occur, the worst case is simply a little
+extra highlighting. This section documents some cases known to be
+sub-optimal, in case somebody feels like working on improving the
+heuristics.
+
+1. Two changes on the same line get highlighted in a blob. For example,
+   highlighting:
+
+----------------------------------------------
+-foo(buf, size);
++foo(obj->buf, obj->size);
+----------------------------------------------
+
+   yields (where the inside of "+{}" would be highlighted):
+
+----------------------------------------------
+-foo(buf, size);
++foo(+{obj->buf, obj->}size);
+----------------------------------------------
+
+   whereas a more semantically meaningful output would be:
+
+----------------------------------------------
+-foo(buf, size);
++foo(+{obj->}buf, +{obj->}size);
+----------------------------------------------
+
+   Note that doing this right would probably involve a set of
+   content-specific boundary patterns, similar to word-diff. Otherwise
+   you get junk like:
+
+-----------------------------------------------------
+-this line has some -{i}nt-{ere}sti-{ng} text on it
++this line has some +{fa}nt+{a}sti+{c} text on it
+-----------------------------------------------------
+
+   which is less readable than the current output.
+
+2. The multi-line matching assumes that lines in the pre- and post-image
+   match by position. This is often the case, but can be fooled when a
+   line is removed from the top and a new one added at the bottom (or
+   vice versa). Unless the lines in the middle are also changed, diffs
+   will show this as two hunks, and it will not get highlighted at all
+   (which is good). But if the lines in the middle are changed, the
+   highlighting can be misleading. Here's a pathological case:
+
+-----------------------------------------------------
+-one
+-two
+-three
+-four
++two 2
++three 3
++four 4
++five 5
+-----------------------------------------------------
+
+   which gets highlighted as:
+
+-----------------------------------------------------
+-one
+-t-{wo}
+-three
+-f-{our}
++two 2
++t+{hree 3}
++four 4
++f+{ive 5}
+-----------------------------------------------------
+
+   because it matches "two" to "three 3", and so forth. It would be
+   nicer as:
+
+-----------------------------------------------------
+-one
+-two
+-three
+-four
++two +{2}
++three +{3}
++four +{4}
++five 5
+-----------------------------------------------------
+
+   which would probably involve pre-matching the lines into pairs
+   according to some heuristic.