Algorithm-Diff-Fast

# Inserted block for differenceing 
# use Algorithm::Diff qw(traverse_sequences);
# in standard version
# The following BEGIN block contains a verbatim copy of
# Ned Konz' Algorithm::Diff package version 1.15 except
# that subroutine _longestCommonSubsequence has been replace by 
# a routine which internally uses the UNIX diff command for
# the differencing rather than the Perl routines if the 
# length of the sequences exceeds some threshold.
# Also, all POD documentation has been stripped out.
#
# (the distribution on which this modification is based is available
#  from http://search.cpan.org/~nedkonz/Algorithm-Diff-1.15
#  the most recent version can be found via  http://search.cpan.org/search?module=Algorithm::Diff )
# Please note the LICENCE for Algorithm::Diff :
#   "Copyright (c) 2000-2002 Ned Konz.  All rights reserved.
#    This program is free software;
#    you can redistribute it and/or modify it under the same terms
#    as Perl itself."
# The fast-differencing version of latexdiff is provided as a convenience
# for latex users under Unix-like systems which have a 'diff' command.
# If you believe
# the inlining of Algorithm::Diff violates its license please contact
# me and I will modify the latexdiff distribution accordingly.
# Frederik Tilmann (tilmann@esc.cam.ac.uk)
# Jonathan Paisley is acknowledged for the idea of using the system diff
# command to achieve shorter running times
BEGIN { 
package Algorithm::Diff;
use strict;
use vars qw($VERSION @EXPORT_OK @ISA @EXPORT);
use integer;    # see below in _replaceNextLargerWith() for mod to make
                # if you don't use this
require Exporter;
@ISA       = qw(Exporter);
@EXPORT    = qw();
@EXPORT_OK = qw(LCS diff traverse_sequences traverse_balanced sdiff);
$VERSION = sprintf('%d.%02d fast', (q$Revision: 1.15 $ =~ /\d+/g));

# Global parameters

use File::Temp qw/tempfile/;
# if larger number of elements in longestCommonSubsequence smaller than
# this number, then use internal algorithm, otherwise use UNIX diff
use constant THRESHOLD => 100 ; 
# Detect whether diff --minimal option is available
# if yes we use it
use constant MINIMAL => ( system('diff','--minimal','/dev/null','/dev/null') >> 8 ==0 ? "--minimal" : "" ) ;


# McIlroy-Hunt diff algorithm
# Adapted from the Smalltalk code of Mario I. Wolczko, <mario@wolczko.com>
# by Ned Konz, perl@bike-nomad.com


# Create a hash that maps each element of $aCollection to the set of positions
# it occupies in $aCollection, restricted to the elements within the range of
# indexes specified by $start and $end.
# The fourth parameter is a subroutine reference that will be called to
# generate a string to use as a key.
# Additional parameters, if any, will be passed to this subroutine.
#
# my $hashRef = _withPositionsOfInInterval( \@array, $start, $end, $keyGen );

sub _withPositionsOfInInterval
{
	my $aCollection = shift;    # array ref
	my $start       = shift;
	my $end         = shift;
	my $keyGen      = shift;
	my %d;
	my $index;
	for ( $index = $start ; $index <= $end ; $index++ )
	{
		my $element = $aCollection->[$index];
		my $key = &$keyGen( $element, @_ );
		if ( exists( $d{$key} ) )
		{
			unshift ( @{ $d{$key} }, $index );
		}
		else
		{
			$d{$key} = [$index];
		}
	}
	return wantarray ? %d : \%d;
}

# Find the place at which aValue would normally be inserted into the array. If
# that place is already occupied by aValue, do nothing, and return undef. If
# the place does not exist (i.e., it is off the end of the array), add it to
# the end, otherwise replace the element at that point with aValue.
# It is assumed that the array's values are numeric.
# This is where the bulk (75%) of the time is spent in this module, so try to
# make it fast!

sub _replaceNextLargerWith
{
	my ( $array, $aValue, $high ) = @_;
	$high ||= $#$array;

	# off the end?
	if ( $high == -1 || $aValue > $array->[-1] )
	{
		push ( @$array, $aValue );
		return $high + 1;
	}

	# binary search for insertion point...
	my $low = 0;
	my $index;
	my $found;
	while ( $low <= $high )
	{
		$index = ( $high + $low ) / 2;

		#		$index = int(( $high + $low ) / 2);		# without 'use integer'
		$found = $array->[$index];

		if ( $aValue == $found )
		{
			return undef;
		}
		elsif ( $aValue > $found )
		{
			$low = $index + 1;
		}
		else
		{
			$high = $index - 1;
		}
	}

	# now insertion point is in $low.
	$array->[$low] = $aValue;    # overwrite next larger
	return $low;
}

# This method computes the longest common subsequence in $a and $b.

# Result is array or ref, whose contents is such that
# 	$a->[ $i ] == $b->[ $result[ $i ] ]
# foreach $i in ( 0 .. $#result ) if $result[ $i ] is defined.

# An additional argument may be passed; this is a hash or key generating
# function that should return a string that uniquely identifies the given
# element.  It should be the case that if the key is the same, the elements
# will compare the same. If this parameter is undef or missing, the key
# will be the element as a string.

# By default, comparisons will use "eq" and elements will be turned into keys
# using the default stringizing operator '""'.

# Additional parameters, if any, will be passed to the key generation routine.

sub _longestCommonSubsequence
{
	my $a      = shift;    # array ref
	my $b      = shift;    # array ref
	my $keyGen = shift;    # code ref
	my $compare;           # code ref

	# set up code refs
	# Note that these are optimized.
	if ( !defined($keyGen) )    # optimize for strings
	{
		$keyGen = sub { $_[0] };
		$compare = sub { my ( $a, $b ) = @_; $a eq $b };
	}
	else
	{
		$compare = sub {
			my $a = shift;
			my $b = shift;
			&$keyGen( $a, @_ ) eq &$keyGen( $b, @_ );
		};
	}

	my ( $aStart, $aFinish, $bStart, $bFinish, $matchVector ) =
	  ( 0, $#$a, 0, $#$b, [] );

	# Check whether to use internal routine (small number of elements)
	# or use it as a wrapper for UNIX diff
	if ( ( $#$a > $#$b ?  $#$a : $#$b) < THRESHOLD ) {
	  ###	  print STDERR "DEBUG: regular longestCommonSubsequence\n";
	  # First we prune off any common elements at the beginning
	  while ( $aStart <= $aFinish
		  and $bStart <= $bFinish
		  and &$compare( $a->[$aStart], $b->[$bStart], @_ ) )
	    {
	      $matchVector->[ $aStart++ ] = $bStart++;
	    }

	  # now the end
	  while ( $aStart <= $aFinish
		and $bStart <= $bFinish
		and &$compare( $a->[$aFinish], $b->[$bFinish], @_ ) )
	    {
	      $matchVector->[ $aFinish-- ] = $bFinish--;
	    }

	  # Now compute the equivalence classes of positions of elements
	  my $bMatches =
	    _withPositionsOfInInterval( $b, $bStart, $bFinish, $keyGen, @_ );
	  my $thresh = [];
	  my $links  = [];
	  
	  my ( $i, $ai, $j, $k );
	  for ( $i = $aStart ; $i <= $aFinish ; $i++ )
	    {
	      $ai = &$keyGen( $a->[$i], @_ );
	      if ( exists( $bMatches->{$ai} ) )
		{
		  $k = 0;
		  for $j ( @{ $bMatches->{$ai} } )
		    {
		      
		      # optimization: most of the time this will be true
		      if ( $k and $thresh->[$k] > $j and $thresh->[ $k - 1 ] < $j )
			{
			  $thresh->[$k] = $j;
			}
		      else
			{
			  $k = _replaceNextLargerWith( $thresh, $j, $k );
			}
		      
		      # oddly, it's faster to always test this (CPU cache?).
		      if ( defined($k) )
			{
			  $links->[$k] =
			    [ ( $k ? $links->[ $k - 1 ] : undef ), $i, $j ];
			}
		    }
		}
	    }

	  if (@$thresh)
	    {
	      for ( my $link = $links->[$#$thresh] ; $link ; $link = $link->[0] )
		{
		  $matchVector->[ $link->[1] ] = $link->[2];
		}
	    }
	}
	else {
	  my ($fha,$fhb,$fna,$fnb,$ele,$key);
	  my ($alines,$blines,$alb,$alf,$blb,$blf);
	  my ($minimal)=MINIMAL;
	  # large number of elements, use system diff
	  ###	  print STDERR "DEBUG: fast (diff) longestCommonSubsequence\n";

	  ($fha,$fna)=tempfile("DiffA-XXXX") or die "_longestCommonSubsequence: Cannot open tempfile for sequence A";
	  ($fhb,$fnb)=tempfile("DiffB-XXXX") or die "_longestCommonSubsequence: Cannot open tempfile for sequence B";
	  # prepare sequence A
	  foreach $ele ( @$a ) {
	    $key=&$keyGen( $ele, @_ );
	    $key =~ s/\\/\\\\/g ;
	    $key =~ s/\n/\\n/sg ;
###	    if ($key =~ m/general/ ) {
###	      print STDERR "DEBUG: A Sequence |$key|";
###	    }
	    print $fha "$key\n" ;
	  }
	  close($fha);
	  # prepare sequence B
	  foreach $ele ( @$b ) {
	    $key=&$keyGen( $ele, @_ );
	    $key =~ s/\\/\\\\/g ;
	    $key =~ s/\n/\\n/sg ;
###	    if ($key =~ m/general/ ) {
###	      print STDERR "DEBUG: B Sequence |$key|";
###	    }
	    print $fhb "$key\n" ;
	  }
	  close($fhb);
###	  print STDERR "DEBUG: Sequence A, reprocessed in $fna:\n===============\n";
###	  system ("cat", $fna);
###	  print STDERR "DEBUG: ",MINIMAL,"============================\n";
	  
	  open(DIFFPIPE, "diff $minimal $fna $fnb |") or die "_longestCommonSubsequence: Cannot launch diff process. $!" ;
	  # The diff line numbering begins with 1, but Perl subscripts start with 0
	  # We follow the diff numbering but substract 1 when assigning to matchVector
	  $aStart++; $bStart++ ; $aFinish++ ; $bFinish++ ;
	  while( <DIFFPIPE> ) {
	    if ( ($alines,$blines) = ( m/^(\d*(?:,\d*)?)?c(\d*(?:,\d*)?)?$/ ) ) {
###	      print STDERR "DEBUG: Detected changed lines: range ${alines}c${blines}\n";
	      ($alb,$alf)=split(/,/,$alines);
	      ($blb,$blf)=split(/,/,$blines);
	      $alf=$alb unless defined($alf);
	      $blf=$blb unless defined($blf);
###	      print STDERR "DEBUG: $alb $alf $blb $blf\n";
	      while($aStart < $alb ) {
		$matchVector->[ -1 + $aStart++ ] = -1 + $bStart++ ;
	      }
	      # check for consistency
	      $bStart==$blb or die "_longestCommonSubsequence: Fatal error in interpreting diff output: Inconsistency in changed sequence";
	      $aStart=$alf+1;
	      $bStart=$blf+1;
	    }
	    elsif ( ($alb,$blines) = ( m/^(\d*)a(\d*(?:,\d*)?)$/ ) ) {
###	      print STDERR "DEBUG: Detected appended lines: range ${alb}a${blines}\n";
	      ($blb,$blf)=split(/,/,$blines);
	      $blf=$blb unless defined($blf);
	      while ( $bStart < $blb ) {
		$matchVector->[ -1 + $aStart++ ] = -1 + $bStart++ ;
	      }
	      $aStart==$alb+1 or die "_longestCommonSubsequence: Fatal error in interpreting diff output: Inconsistency in appended sequence near elements $aStart and $bStart";
	      $bStart=$blf+1;
	    }
	    elsif ( ($alines,$blb) = ( m/^(\d*(?:,\d*)?)d(\d*)$/ ) ) {
###	      print STDERR "DEBUG: Detected deleted lines: range ${alines}c${blb}\n";
	      ($alb,$alf)=split(/,/,$alines);
	      $alf=$alb unless defined($alf);
	      while ( $aStart < $alb ) {
		$matchVector->[ -1 + $aStart++ ] = -1 + $bStart++ ;
	      }
	      $bStart==$blb+1 or die "_longestCommonSubsequence: Fatal error in interpreting diff output: Inconsistency in deleted sequence near elements $aStart and $bStart";
	      $aStart=$alf+1;
	    }
	    elsif ( m/^Binary files/ ) {
	      # if diff reports it is a binary file force --text mode. I do not like
	      # to always use this option because it is probably only available in GNU diff
###	      print STDERR "DEBUG: Detected binary file. Relaunching diff with text option\n" ;
	      open(DIFFPIPE, "diff --text $fna $fnb |") or die "Cannot launch diff process. $!" ;
	    }
	    # Default: just skip line
	  }
	  while ($aStart <= $aFinish ) {
	    $matchVector->[ -1 + $aStart++ ] = -1 + $bStart++ ;
	  }
	  $bStart==$bFinish+1  or die "_longestCommonSubsequence: Fatal error in interpreting diff output: Inconsistency at end";
	  close DIFFPIPE;
	  # check whether a system error has occurred or return status is greater than or equal to 5
	  if ( $! || ($? >> 8) > 5) {
	    print STDERR "diff process failed with exit code ", ($? >> 8), " $!\n";
	    die;
	  }
###	  print STDERR "DEBUG TEMPFILES: $fna, $fnb\n";
	  unlink $fna,$fnb ;
	}
	return wantarray ? @$matchVector : $matchVector;
}

sub traverse_sequences
{
	my $a                 = shift;                                  # array ref
	my $b                 = shift;                                  # array ref
	my $callbacks         = shift || {};
	my $keyGen            = shift;
	my $matchCallback     = $callbacks->{'MATCH'} || sub { };
	my $discardACallback  = $callbacks->{'DISCARD_A'} || sub { };
	my $finishedACallback = $callbacks->{'A_FINISHED'};
	my $discardBCallback  = $callbacks->{'DISCARD_B'} || sub { };
	my $finishedBCallback = $callbacks->{'B_FINISHED'};
	my $matchVector = _longestCommonSubsequence( $a, $b, $keyGen, @_ );

	# Process all the lines in @$matchVector
	my $lastA = $#$a;
	my $lastB = $#$b;
	my $bi    = 0;
	my $ai;

	for ( $ai = 0 ; $ai <= $#$matchVector ; $ai++ )
	{
		my $bLine = $matchVector->[$ai];
		if ( defined($bLine) )    # matched
		{
			&$discardBCallback( $ai, $bi++, @_ ) while $bi < $bLine;
			&$matchCallback( $ai,    $bi++, @_ );
		}
		else
		{
			&$discardACallback( $ai, $bi, @_ );
		}
	}

	# The last entry (if any) processed was a match.
	# $ai and $bi point just past the last matching lines in their sequences.

	while ( $ai <= $lastA or $bi <= $lastB )
	{

		# last A?
		if ( $ai == $lastA + 1 and $bi <= $lastB )
		{
			if ( defined($finishedACallback) )
			{
				&$finishedACallback( $lastA, @_ );
				$finishedACallback = undef;
			}
			else
			{
				&$discardBCallback( $ai, $bi++, @_ ) while $bi <= $lastB;
			}
		}

		# last B?
		if ( $bi == $lastB + 1 and $ai <= $lastA )
		{
			if ( defined($finishedBCallback) )
			{
				&$finishedBCallback( $lastB, @_ );
				$finishedBCallback = undef;
			}
			else
			{
				&$discardACallback( $ai++, $bi, @_ ) while $ai <= $lastA;
			}
		}

		&$discardACallback( $ai++, $bi, @_ ) if $ai <= $lastA;
		&$discardBCallback( $ai, $bi++, @_ ) if $bi <= $lastB;
	}

	return 1;
}

sub traverse_balanced
{
	my $a                 = shift;                                  # array ref
	my $b                 = shift;                                  # array ref
	my $callbacks         = shift || {};
	my $keyGen            = shift;
	my $matchCallback     = $callbacks->{'MATCH'} || sub { };
	my $discardACallback  = $callbacks->{'DISCARD_A'} || sub { };
	my $discardBCallback  = $callbacks->{'DISCARD_B'} || sub { };
	my $changeCallback    = $callbacks->{'CHANGE'};
	my $matchVector = _longestCommonSubsequence( $a, $b, $keyGen, @_ );

	# Process all the lines in match vector
	my $lastA = $#$a;
	my $lastB = $#$b;
	my $bi    = 0;
	my $ai    = 0;
	my $ma    = -1;
	my $mb;

	while (1)
	{

		# Find next match indices $ma and $mb
		do { $ma++ } while ( $ma <= $#$matchVector && !defined $matchVector->[$ma] );

		last if $ma > $#$matchVector;    # end of matchVector?
		$mb = $matchVector->[$ma];

		# Proceed with discard a/b or change events until
		# next match
		while ( $ai < $ma || $bi < $mb )
		{

			if ( $ai < $ma && $bi < $mb )
			{

				# Change
				if ( defined $changeCallback )
				{
					&$changeCallback( $ai++, $bi++, @_ );
				}
				else
				{
					&$discardACallback( $ai++, $bi, @_ );
					&$discardBCallback( $ai, $bi++, @_ );
				}
			}
			elsif ( $ai < $ma )
			{
				&$discardACallback( $ai++, $bi, @_ );
			}
			else
			{

				# $bi < $mb
				&$discardBCallback( $ai, $bi++, @_ );
			}
		}

		# Match
		&$matchCallback( $ai++, $bi++, @_ );
	}

	while ( $ai <= $lastA || $bi <= $lastB )
	{
		if ( $ai <= $lastA && $bi <= $lastB )
		{

			# Change
			if ( defined $changeCallback )
			{
				&$changeCallback( $ai++, $bi++, @_ );
			}
			else
			{
				&$discardACallback( $ai++, $bi, @_ );
				&$discardBCallback( $ai, $bi++, @_ );
			}
		}
		elsif ( $ai <= $lastA )
		{
			&$discardACallback( $ai++, $bi, @_ );
		}
		else
		{

			# $bi <= $lastB
			&$discardBCallback( $ai, $bi++, @_ );
		}
	}

	return 1;
}

sub LCS
{
	my $a = shift;                                           # array ref
	my $matchVector = _longestCommonSubsequence( $a, @_ );
	my @retval;
	my $i;
	for ( $i = 0 ; $i <= $#$matchVector ; $i++ )
	{
		if ( defined( $matchVector->[$i] ) )
		{
			push ( @retval, $a->[$i] );
		}
	}
	return wantarray ? @retval : \@retval;
}

sub diff
{
	my $a      = shift;    # array ref
	my $b      = shift;    # array ref
	my $retval = [];
	my $hunk   = [];
	my $discard = sub { push ( @$hunk, [ '-', $_[0], $a->[ $_[0] ] ] ) };
	my $add = sub { push ( @$hunk, [ '+', $_[1], $b->[ $_[1] ] ] ) };
	my $match = sub { push ( @$retval, $hunk ) if scalar(@$hunk); $hunk = [] };
	traverse_sequences( $a, $b,
		{ MATCH => $match, DISCARD_A => $discard, DISCARD_B => $add }, @_ );
	&$match();
	return wantarray ? @$retval : $retval;
}

sub sdiff
{
	my $a      = shift;    # array ref
	my $b      = shift;    # array ref
	my $retval = [];
	my $discard = sub { push ( @$retval, [ '-', $a->[ $_[0] ], "" ] ) };
	my $add = sub { push ( @$retval, [ '+', "", $b->[ $_[1] ] ] ) };
	my $change = sub {
		push ( @$retval, [ 'c', $a->[ $_[0] ], $b->[ $_[1] ] ] );
	};
	my $match = sub {
		push ( @$retval, [ 'u', $a->[ $_[0] ], $b->[ $_[1] ] ] );
	};
	traverse_balanced(
		$a,
		$b,
		{
			MATCH     => $match,
			DISCARD_A => $discard,
			DISCARD_B => $add,
			CHANGE    => $change,
		},
		@_
	);
	return wantarray ? @$retval : $retval;
}

1;
}
import Algorithm::Diff qw(traverse_sequences);
# End of inserted block for stand-alone version