#!/usr/bin/perl


# Package to parse arguments
use Getopt::Std;

# DB package
#use MLDBM qw(DB_File Storable);
use Fcntl;
use strict;

# Data dumper for debug
use Data::Dumper;

# Serializing functions
use Storable qw(nstore store_fd nstore_fd freeze thaw dclone);
use Digest::MD5 qw(md5 md5_hex md5_base64);

# Show version
version();

# Get options
my %options=();
getopts("i:t:",\%options) or usage();

my $input_file;
our %stats;

# Parsing arguments
if ($options{'i'}) {
        $input_file = $options{'i'};
}
else {
	usage();
}

# 
# Indexes:
#
# Hash:
# word -> id
# md5(frame) -> frame
# 
#
# M2M:
# word_id -> frame_id
# frame_id -> word_id
#
#
# 



# Open input file and parse it
if (!( -f $input_file)) {
	die "Not such file: $input_file";
}

open(IN, "<$input_file") or die "Error opening file: $input_file";

my $line;
my $finished = 0;
my $frame;
my %frames;
my $frame_ref;
my %words;
my @frame_md5;
my %frame_words;

# Scan every line
while(($line = <IN>) && ($finished == 0)) {
	next if $line =~ /^#/;
	next if $line =~ /^[\ \t]*$/;

	# Handle EOF marker
	if (substr($line, 0, 4) eq "\@EOF") {
		$finished = 1;
		next;
	}

	if (substr($line, 0, 1) eq "\@") {
		
		# TODO: Parse action
		$frame = readaction($line);
		$frame_ref = abstractions($frame);
		$stats{'learned_frames'}++;

		undef @frame_md5;
		foreach $frame (@{$frame_ref}) {
			my $hash = md5hash($frame);
			push @frame_md5, $hash;
			if (not exists ($frames{$hash})) {
				$frames{$hash} = $frame;
				$stats{'unique_abstracted_frames'}++;
			}
			$stats{'learned_abstracted_frames'}++;
		}
	}

		
	# A '>' indicates input to a 'child'
	if (substr($line, 0, 1) eq ">") {
		# TODO: Parse speech
		$stats{'learned_speech_lines'}++;
		my $ref = readspeech($line, \%words, \@frame_md5, \%frame_words);
	}

	# A '<' indicate output of a 'child'
	if (substr($line, 0, 1) eq "<") {
		# TODO: Parse output speech?
	}

}
close(IN);


print "Learning fase finished ...\n";
print Dumper(\%stats);


#############################


# Check if we want to do some testing
if ($options{'t'}) {
        $input_file = $options{'t'};
}
else {
	exit;

}

print "Testing fase ...\n";
open(IN, "<$input_file") or die "Error opening file: $input_file";

$finished = 0;
while(($line = <IN>) && ($finished == 0)) {
	next if $line =~ /^#/;
	next if $line =~ /^[\ \t]*$/;

	if (substr($line, 0, 1) eq "\@") {
		
		# TODO: Parse action
		$frame = readaction($line);
		

		$frame_ref = abstractions($frame);
# ArjanAndreas
#		print Dumper($frame_ref);

		undef @frame_md5;
		foreach $frame (@{$frame_ref}) {	
			my $hash = md5hash($frame);
			push @frame_md5, $hash;
			if (not exists ($frames{$hash})) {
				$frames{$hash} = $frame;
				$stats{'test_unique_abstracted_frames'}++;
			}
			$stats{'test_abstracted_frames'}++;
		}
		speech(\@frame_md5, \%frame_words, \%words);


	}

}


close(IN);



sub speech {
	my $frame_ref = $_[0];
	my $frame_words_ref = $_[1];
	my $words_ref =  $_[2];

	my %calc;
	undef %calc;
	my $top = 0;
	my $topword = "";

	
	foreach my $md5 (@{$frame_ref}) {
		
		my $ref = ${$frame_words_ref}{$md5}{'words'};
		foreach my $word (keys %{$ref}) {
			if (not exists $calc{$word}) {
				$calc{$word}{'count'} = 0;
			}
			$calc{$word}{'count'} += ${$frame_words_ref}{$md5}{'words'}{$word};
		}
	}

#	my $tot = 0;
#	foreach my $k (keys %{$words_ref}) {
#		$tot += ${$words_ref}{$k}{'count'};
#	}

	my %calc2;
	foreach my $word (keys %calc) {
		my $total = ${$words_ref}{$word}{'count'};
		#$calc{$word}{'total'} = $total;
		my $divided = $calc{$word}{'count'} / $total;
#		my $divided = $calc{$word}{'count'} - $total;
#		my $divided = $calc{$word}{'count'} / $tot;
#		my $divided = $total / $calc{$word}{'count'} / $total;

# ArjanAndreas
#		if ($total > 10 && $total < 45) {	
		if ($total > 10) {	
			$calc2{$word} = $divided;
		}
#		$calc2{$word} = $total;
	}

	my $c = 0;
	foreach my $row (sort { $calc2{$b} <=> $calc2{$a} } keys %calc2 ) {
		printf("%-20s %s\n", $row, $calc2{$row});
		$c++;
		
		if ($c == 10) {
			print ".\n";
			return;	
		}

	}
	print ".\n";

}



# Read speech
sub readspeech {
	my $line = $_[0];
	my $words_ref = $_[1];
	my $frame_ref = $_[2];
	my $frame_words_ref = $_[3];
	my @words;

	# Filter
	$line = substr($line, 1);  # Chop first char (it's a > sign)
	$line =~ s/\?//g; # Ignore some chars
	$line =~ s/\!//g;
	$line =~ s/ +/ /g;
	$line =~ s/^ //;
	$line =~ s/ $//;

	@words = split(" ", $line);

	foreach my $word (@words) {
		if (not exists ${$words_ref}{$word}) {
			${$words_ref}{$word}{'count'} = 1;
			$stats{'unique_words'}++;
			$stats{'learned_words'}++;
		}
		else {
			${$words_ref}{$word}{'count'} += 1;
			$stats{'learned_words'}++;
		}

		foreach my $md5 (@{$frame_ref}) {
			if (not exists ${$frame_words_ref}{$md5}{'words'}{$word}) {
				${$frame_words_ref}{$md5}{'words'}{$word} = 1;
			}
			else {
				${$frame_words_ref}{$md5}{'words'}{$word}++;
			}

			if (not exists ${$frame_words_ref}{$md5}{'count'}) {
				${$frame_words_ref}{$md5}{'count'} = 1;
			}
			else {
				${$frame_words_ref}{$md5}{'count'}++;
			}
		}

	}
	return \@words;	
}



# Read an action
sub readaction {
	my $line = $_[0];

	my $data = "";
	my $brackets = 0;
	my $finished = 0;


	# This parts continues to read the whole frame
	while($finished == 0) {
		my $tmp1 = $line;
		my $tmp2 = $line;

		$tmp1 =~ s/[^{]//g;
		$tmp2 =~ s/[^}]//g;
	
		$brackets += (length($tmp1) - length($tmp2));

		# Add this line to data
		$data .= $line;

		if ($brackets == 0) {
			$finished = 1;
		}
		else {
			# Read next line
			if (eof (IN)) {	
				die("Parse error, '}' not found before EOF");
			}
			$line = <IN>;
		}
	}
	
	# $data now contains the whole frame
	# next is to analize the frame and
	# store it in an hash table.

	# Replace returns by spaces	

	my $value = tokenize($data, 0);

	# TODO: Make some uniq ID for value (md5?)
	# TODO: Split in smaller parts and store?
	

	return $value;

}



# This function creates all abstractions from a
# frame and returns a reference to an array of
# them.
# 
sub abstractions() {

	# Initial fram
	my $frame = $_[0];
	
	# Two framesets	
	my @set;
	my @set2;

	# Reference
	my $ref;
	
	# String of abstraction
	my $abstraction = "";

	# Inital, start with one;
	my $option1 = deepcopy($frame);
	my $option2;

	# Add current frame to set
	push @set, $option1;



	# For all keys,
	foreach my $key (keys(%{$frame})) {

		# If it's an hash, recure in it
		if (ref(${$frame}{$key}) eq "HASH") {
	
			# Reset @set2,
			undef @set2;

			# For all current sets
			foreach my $current (@set) {

				# Recurse and get abstractions
				$ref = abstractions(${$current}{$key});
			
				# Using this reference, add all new
				# abstraction to set
				foreach my $current2 (@{$ref}) {
					my $new = deepcopy($current);
					my $new2 = deepcopy($current2);
					my $new3 = deepcopy($current);
					${$new}{$key} = $current2;
					delete ${$new3}{$key} ;
					push @set2, $new;
					push @set2, $new2;   # Chopped @ end
					push @set2, $new3;   # Chopped @ begin
				}
			}
			# copy
			@set = @set2;
		}
		else {
			# save abstraction
			if ($key eq "abstraction") {
				$abstraction = ${$frame}{$key};
			}
		}

	}

	# Found an abstraction	
	if ($abstraction ne "") {
		undef @set2;
		foreach my $current (@set) {
			$option1 = deepcopy($current);
			$option2 = deepcopy($current);
			${$option2}{'id'} = $abstraction;
			delete ${$option2}{'abstraction'};
			push @set2, $option1;
			push @set2, $option2;
		}
		@set = @set2;
	}

	undef @set2;
	foreach my $current (@set) {
		my $i = scalar keys %{$current};
		if ($i > 0) {
			push @set2, $current;
		}
	}
	return \@set2;

	

}


# TODO: Could be better to sort the hash first in some
# way, so the frozen hash is the same for hashes with
# the same content (but maybe different order)
sub md5hash() {
	my $h = $_[0];
	my $level = $_[1];
	my $str = "";

	if (!defined $level)  {
		$level = 0;
	}

	foreach my $k (sort (keys %{$h})) {

		if (ref ${$h}{$k} eq "ARRAY" || ref ${$h}{$k} eq "HASH") {
			$str .= "$k:{";
			$str .= md5hash(${$h}{$k}, $level+1);
			$str .= "},";
		}
		else {
			$str .= "$k:" . ${$h}{$k} .",";
		}
	}

	if ($level == 0) {
		return md5_hex($str);
	}
	else {
		return $str;
	}
}



# This function tokenizes a block
# it uses recursion
sub tokenize() {
	my $data = $_[0];
	my $level = $_[1];


	my $i = 0;
	my $brackets = 0;
	my $key = "";
	my $value = "";
	my $key_done = 0;

	my %set;
	my $max = length($data);

	# Loop true data
	for($i = 0; $i < length($data); $i++) {

		# Active char
		my $char = substr($data, $i, 1);

		# If key name not set yet, we
		# assume this is about the key
		# name. this will finish after a ":"
		# is found
		if ($key_done == 0) {
			if ($char eq ":") {
				$key_done = 1;
				$key =~ s/[ |\t|\n]//g;
				next;
		
			}
			else {	
				$key .= $char;
			}
		}

		# if the key name is done
		# then the rest will be a value
		# untill a ";" is found or a "{ }" 
		# block is closed
		if ($key_done == 1) {


			if ($char eq ";" || $char eq "{") {
				$key_done = 0;
				$value =~ s/[ |\t|\n]//g;
				#print "Found value: $level $value\n";
				$set{$key} = $value;

			} 
			else {
				$value .= $char;
			}


			# This parts retrieves the whole {} block
			if ($char eq "{") {
				my $data_new = "";
				$brackets = 1;
				my $finished = 0;
				while($finished == 0) {
					$i++;

					my $char = substr($data, $i, 1);
					if ($char eq "{") {
						$brackets++;
					}

					if ($char eq "}") {
						$brackets--;
					}

					if ($brackets == 0) {
						my $val = tokenize($data_new, $level+1);

						$set{$key} = $val;
						$set{$key}{'id'} = $value;
							
						$finished = 1;
					}
					else {
						$data_new .= $char;
					}
				}
				$key_done = 0;

			}


			if ($char eq ";" || $char eq "{") {

				$value = "";
				$key = "";
			}


		}
		
		
	}

	return \%set;

	

}



# Helper functions
sub deepcopy {
	my $this = shift;
	if (not ref $this) {
		$this;
	} elsif (ref $this eq "ARRAY") {
		[map deepcopy($_), @$this];
	} elsif (ref $this eq "HASH") {
		+{map { $_ => deepcopy($this->{$_}) } keys %$this};
	} else { die "what type is $_?" }
}




# Meta info
sub version {
        print STDERR "eend version 0.1.5\n";
        print STDERR "use -h for help on this program\n\n";
}

sub usage {
	print STDERR "eend [ -i file ] [ -t file ]\n";
	print STDERR "  -i input filename\n";
	print STDERR "  -t test training on file\n";
	exit;
}
