From mboxrd@z Thu Jan 1 00:00:00 1970 From: Jeff King Subject: Re: [PATCH] Improve contrib/diff-highlight to highlight unevenly-sized hunks Date: Thu, 18 Jun 2015 17:23:56 -0400 Message-ID: <20150618212356.GA20271@peff.net> References: <1434388853-23915-1-git-send-email-patrick@parcs.ath.cx> <20150618190417.GA12769@peff.net> <20150618204505.GD14550@peff.net> Mime-Version: 1.0 Content-Type: text/plain; charset=utf-8 Cc: Junio C Hamano , git@vger.kernel.org To: Patrick Palka X-From: git-owner@vger.kernel.org Thu Jun 18 23:24:26 2015 Return-path: Envelope-to: gcvg-git-2@plane.gmane.org Received: from vger.kernel.org ([209.132.180.67]) by plane.gmane.org with esmtp (Exim 4.69) (envelope-from ) id 1Z5hID-0003ut-A8 for gcvg-git-2@plane.gmane.org; Thu, 18 Jun 2015 23:24:13 +0200 Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S1752315AbbFRVYD (ORCPT ); Thu, 18 Jun 2015 17:24:03 -0400 Received: from cloud.peff.net ([50.56.180.127]:48492 "HELO cloud.peff.net" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with SMTP id S1752012AbbFRVYA (ORCPT ); Thu, 18 Jun 2015 17:24:00 -0400 Received: (qmail 27966 invoked by uid 102); 18 Jun 2015 21:23:59 -0000 Received: from Unknown (HELO peff.net) (10.0.1.1) by cloud.peff.net (qpsmtpd/0.84) with SMTP; Thu, 18 Jun 2015 16:23:59 -0500 Received: (qmail 8952 invoked by uid 107); 18 Jun 2015 21:23:58 -0000 Received: from sigill.intra.peff.net (HELO sigill.intra.peff.net) (10.0.0.7) by peff.net (qpsmtpd/0.84) with SMTP; Thu, 18 Jun 2015 17:23:58 -0400 Received: by sigill.intra.peff.net (sSMTP sendmail emulation); Thu, 18 Jun 2015 17:23:56 -0400 Content-Disposition: inline In-Reply-To: <20150618204505.GD14550@peff.net> Sender: git-owner@vger.kernel.org Precedence: bulk List-ID: X-Mailing-List: git@vger.kernel.org Archived-At: On Thu, Jun 18, 2015 at 04:45:05PM -0400, Jeff King wrote: > Still, I think this is probably a minority case, and it may be > outweighed by the improvements. The "real" solution is to consider the > hunk as a whole and do an LCS diff on it, which would show that yes, > it's worth highlighting both of those spots, as they are a small > percentage of the total hunk. I've been meaning to play with this for years, so I took the opportunity to spend a little time on it. :) Below is a (slightly hacky) patch I came up with. It seems to work, and produces really great output in some cases. For instance, in 99a2cfb, it produces (I put highlighted bits in angle brackets): - cpy(peeled, ); + cpy(<&>peeled, ); It also produces nonsense like: - sed peeled<[20]>; + sed peeled; but I think that is simply because my splitting function is terrible (it splits each byte, whereas we'd probably want to use whitespace and punctuation, or something content-specific). --- diff --git a/contrib/diff-highlight/diff-highlight b/contrib/diff-highlight/diff-highlight index ffefc31..7165518 100755 --- a/contrib/diff-highlight/diff-highlight +++ b/contrib/diff-highlight/diff-highlight @@ -3,6 +3,7 @@ use 5.008; use warnings FATAL => 'all'; use strict; +use Algorithm::Diff; # Highlight by reversing foreground and background. You could do # other things like bold or underline if you prefer. @@ -88,131 +89,54 @@ sub show_hunk { return; } - # If we have mismatched numbers of lines on each side, we could try to - # be clever and match up similar lines. But for now we are simple and - # stupid, and only handle multi-line hunks that remove and add the same - # number of lines. - if (@$a != @$b) { - print @$a, @$b; - return; - } - - my @queue; - for (my $i = 0; $i < @$a; $i++) { - my ($rm, $add) = highlight_pair($a->[$i], $b->[$i]); - print $rm; - push @queue, $add; - } - print @queue; -} - -sub highlight_pair { - my @a = split_line(shift); - my @b = split_line(shift); + my ($prefix_a, $suffix_a, @hunk_a) = split_hunk(@$a); + my ($prefix_b, $suffix_b, @hunk_b) = split_hunk(@$b); - # Find common prefix, taking care to skip any ansi - # color codes. - my $seen_plusminus; - my ($pa, $pb) = (0, 0); - while ($pa < @a && $pb < @b) { - if ($a[$pa] =~ /$COLOR/) { - $pa++; - } - elsif ($b[$pb] =~ /$COLOR/) { - $pb++; - } - elsif ($a[$pa] eq $b[$pb]) { - $pa++; - $pb++; - } - elsif (!$seen_plusminus && $a[$pa] eq '-' && $b[$pb] eq '+') { - $seen_plusminus = 1; - $pa++; - $pb++; - } - else { - last; - } - } + my $diff = Algorithm::Diff->new(\@hunk_a, \@hunk_b); + my (@out_a, @out_b); + while ($diff->Next()) { + my $bits = $diff->Diff(); - # Find common suffix, ignoring colors. - my ($sa, $sb) = ($#a, $#b); - while ($sa >= $pa && $sb >= $pb) { - if ($a[$sa] =~ /$COLOR/) { - $sa--; - } - elsif ($b[$sb] =~ /$COLOR/) { - $sb--; - } - elsif ($a[$sa] eq $b[$sb]) { - $sa--; - $sb--; - } - else { - last; - } - } + push @out_a, $OLD_HIGHLIGHT[1] if $bits & 1; + push @out_a, $diff->Items(1); + push @out_a, $OLD_HIGHLIGHT[2] if $bits & 1; - if (is_pair_interesting(\@a, $pa, $sa, \@b, $pb, $sb)) { - return highlight_line(\@a, $pa, $sa, \@OLD_HIGHLIGHT), - highlight_line(\@b, $pb, $sb, \@NEW_HIGHLIGHT); - } - else { - return join('', @a), - join('', @b); + push @out_b, $NEW_HIGHLIGHT[1] if $bits & 2; + push @out_b, $diff->Items(2); + push @out_b, $NEW_HIGHLIGHT[2] if $bits & 2; } -} -sub split_line { - local $_ = shift; - return utf8::decode($_) ? - map { utf8::encode($_); $_ } - map { /$COLOR/ ? $_ : (split //) } - split /($COLOR+)/ : - map { /$COLOR/ ? $_ : (split //) } - split /($COLOR+)/; + output_split_hunk($prefix_a, $suffix_a, @out_a); + output_split_hunk($prefix_b, $suffix_b, @out_b); } -sub highlight_line { - my ($line, $prefix, $suffix, $theme) = @_; - - my $start = join('', @{$line}[0..($prefix-1)]); - my $mid = join('', @{$line}[$prefix..$suffix]); - my $end = join('', @{$line}[($suffix+1)..$#$line]); - - # If we have a "normal" color specified, then take over the whole line. - # Otherwise, we try to just manipulate the highlighted bits. - if (defined $theme->[0]) { - s/$COLOR//g for ($start, $mid, $end); - chomp $end; - return join('', - $theme->[0], $start, $RESET, - $theme->[1], $mid, $RESET, - $theme->[0], $end, $RESET, - "\n" - ); - } else { - return join('', - $start, - $theme->[1], $mid, $theme->[2], - $end - ); +# Return the individual diff-able items of the hunk, but with any +# diff or color prefix/suffix for each line split out (we assume that the +# prefix/suffix for each line will be the same). +sub split_hunk { + my ($prefix, $suffix, @r); + foreach my $line (@_) { + $line =~ /^($COLOR*[+-]$COLOR*)(.*)($COLOR*)/ + or die "eh, this is supposed to match everything!"; + + # overwrite the old values; we assume they're all the same + # anyway + $prefix = $1; + $suffix = $3; + + # do a straight character split. This almost certainly isn't + # ideal, but it's a good starting point. We should at the very + # least be utf8-aware, and probably use color-words regexes. + push @r, split(//, $2), "\n"; } + return ($prefix, $suffix, @r); } -# Pairs are interesting to highlight only if we are going to end up -# highlighting a subset (i.e., not the whole line). Otherwise, the highlighting -# is just useless noise. We can detect this by finding either a matching prefix -# or suffix (disregarding boring bits like whitespace and colorization). -sub is_pair_interesting { - my ($a, $pa, $sa, $b, $pb, $sb) = @_; - my $prefix_a = join('', @$a[0..($pa-1)]); - my $prefix_b = join('', @$b[0..($pb-1)]); - my $suffix_a = join('', @$a[($sa+1)..$#$a]); - my $suffix_b = join('', @$b[($sb+1)..$#$b]); - - return $prefix_a !~ /^$COLOR*-$BORING*$/ || - $prefix_b !~ /^$COLOR*\+$BORING*$/ || - $suffix_a !~ /^$BORING*$/ || - $suffix_b !~ /^$BORING*$/; +sub output_split_hunk { + my $prefix = shift; + my $suffix = shift; + my $str = join('', @_); + $str =~ s/^/$prefix/mg; + $str =~ s/$/$suffix/mg; + print $str; }