#! /usr/bin/perl
#
#
if(grep(/^\-(h|\-help)$/i, @ARGV))
{
print STDOUT << "ENDOFHELP" ;
DPalign.pl

Aligns targetstring with sourcestring using a Dynamic
Programming algorithm with edit distance

Use:
DPalign.pl [--verbose] [--split <pattern|n>] [--license] [--help] 'source' 'target'

--verbose
-v 
Print all information, including the distance and backtrace matrices

--license
-l
Print license information

--help
-h
This message

--split <pattern|n> 
-s <pattern|n> 
The pattern used for the perl 'split' operator. Default is
'' which splits the input into single characters. Pure numbers
are interpreted as indicating the number of characters in each
"block". E.g., '--split 2' is translated into '(..)' which has
the effect of chunking the input into pairs of characters,
i.e., 'intention' -> 'in' 'te' 'nt' 'io' 'n'. This is useful
for two character scripts like Worldbet.

Returns:
(with --verbose option)
Distance matrix
Backtrace matrix

(always)
Distance
Alignment
ENDOFHELP
exit;
};

#
#
###############################################################################
if(grep(/^\-(l|\-license)$/i, @ARGV))
{
print STDOUT << "ENDOFLICENSE" ;

Copyright R.J.J.H. van Son © 20002

Author Rob van Son
Institute of Phonetic Sciences & ACLC
University of Amsterdam
Herengracht 338
NL-1016CG Amsterdam, The Netherlands
Email: Rob.van.Son@hum.uva.nl
	   rob.van.son@workmail.com
WWW  : http://www.fon.hum.uva.nl/rob/
mail:  Institute of Phonetic Sciences
	   University of Amsterdam
	   Herengracht 338
	   NL-1016CG Amsterdam
	   The Netherlands
	   tel +31 205252183
	   fax +31 205252197

License for use and disclaimers

This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License
as published by the Free Software Foundation; either version 2
of the License, or (at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.


ENDOFLICENSE
exit;
};
#######################################################
#
# Get options
# --verbose
while(grep(/^\-(v|\-verbose)$/i, @ARGV))
{
	$verbose = 1;
	@ARGV = grep(!/^\-(v|\-verbose)$/i, @ARGV);
};

# --split 'match-value'
my $split = '';
while(grep(/^\-(s|\-split)$/i, @ARGV))
{
	my $i;
	for($i=0;$i<scalar(@ARGV);++$i)
	{
		if($ARGV[$i] =~ /^\-(s|\-split)$/i)
		{
			$split = $ARGV[$i+1];
			splice(@ARGV, $i, 2);
			last;
		};
	};
	@ARGV = grep(!/^\-(s|\-split)$/i, @ARGV);
};
# A simple separation into character blocks (i.e., -s 2 means 2 character symbols)
$split = "(".("."x$split).")" unless $split =~ /[^\d]/;

# Get arguments from ARGV, split them immediately 
# into lists of characters
my @source = grep /[\S\s]/, split /$split/, shift(@ARGV);
my @target = grep /[\S\s]/, split /$split/, shift(@ARGV);

# Add boundary symbol to source and target
unshift(@source, "\#");
unshift(@target, "\#");

# Define and initialize variables
my @Distance = ();

my @BackPointer = ();

my ($i, $j) = (0, 0);

# Get the position of the minimum of the arguments
# In case of a non-unique minimum, return the first
sub minimumarg	# ($a, $b, $c, ...) -> Argument Number of minimum
{
	my $argmin = 0;
	my  $i = 0;
	# Iterate over the argument list (@_)
	for($i=0; $i<scalar(@_); ++$i)
	{
		$argmin = $i if $_[$i] < $_[$argmin]
	};
	
	return $argmin;
}

$inscost = 1;
$delcost = 1;
$subcost = 2;

my @backpointersymbols = ('S', 'I', 'D');

# Initialize matrices with simple values
$Distance[0][0] = 0;
$BackPointer[0][0] = $backpointersymbols[0];
for($i=1; $i<scalar(@source); ++$i)
{
	$Distance[$i][0] = $i;
	$BackPointer[$i][0] = $backpointersymbols[1];
};
for($j=1; $j<scalar(@target); ++$j)
{
	$Distance[0][$j] = $j;
	$BackPointer[0][$j] = $backpointersymbols[2];
};


# Fill distance and backpointer matrices
for($i=1; $i<scalar(@source); ++$i)
{
	for($j=1; $j<scalar(@target); ++$j)
	{
		$currentsubcost = $source[$i] eq $target[$j] ? 0 : $subcost;
		my @currentDistances = (
					$Distance[$i-1][$j-1] + $currentsubcost,
					$Distance[$i-1][$j] + $inscost,
					$Distance[$i][$j-1] + $delcost);
		my $minarg = minimumarg(@currentDistances);
		$Distance[$i][$j] = $currentDistances[$minarg];
		$BackPointer[$i][$j] = $backpointersymbols[$minarg];
	};
};

#
# Ready, print out results

# The matrices, only in verbose mode
if($verbose)
{
	print "Distance matrix:\n";
	for($i=$#source; $i>=0; --$i)
	{
		print $source[$i], "\t", join("\t", @{$Distance[$i]}), "\n";
	};
	print "\t", join("\t", @target), "\n";

	print "\n";

	print "Traceback matrix:\n";
	for($i=$#source; $i>=0; --$i)
	{
		print $source[$i], "\t", join("\t", @{$BackPointer[$i]}), "\n";
	};
	print "\t", join("\t", @target), "\n\n";
};

$i = $#source;
$j = $#target;
my @targetout = ();
my @sourceout = ();
my $InsDelSymbol = "*".' 'x (length($source[1])-1);
while($i > 0 || $j > 0)
{
	if($BackPointer[$i][$j] eq $backpointersymbols[0])
	{
		unshift(@sourceout, $source[$i]);
		unshift(@targetout, $target[$j]);
		--$i;
		--$j;
	}
	elsif($BackPointer[$i][$j] eq $backpointersymbols[1])
	{
		unshift(@sourceout, $source[$i]);
		unshift(@targetout, $InsDelSymbol);
		--$i;
	}
	elsif($BackPointer[$i][$j] eq $backpointersymbols[2])
	{
		unshift(@sourceout, $InsDelSymbol);
		unshift(@targetout, $target[$j]);
		--$j;
	}
	else
	{
		die "Error, invalid backpointer at ($i, $j)\n";
	};
};

print "Total distance: ", $Distance[$#source][$#target], "\n";

print "Alignment\n";
print "Source:\t@sourceout\n";
print "Target:\t@targetout\n";
