#!/usr/bin/perl # Package to parse arguments use Getopt::Std; # DB package #use MLDBM qw(DB_File Storable); use Fcntl; use strict; # Data dumper for debug use Data::Dumper; # Serializing functions use Storable qw(nstore store_fd nstore_fd freeze thaw dclone); use Digest::MD5 qw(md5 md5_hex md5_base64); # Show version version(); # Get options my %options=(); getopts("i:t:",\%options) or usage(); my $input_file; our %stats; # Parsing arguments if ($options{'i'}) { $input_file = $options{'i'}; } else { usage(); } # # Indexes: # # Hash: # word -> id # md5(frame) -> frame # # # M2M: # word_id -> frame_id # frame_id -> word_id # # # # Open input file and parse it if (!( -f $input_file)) { die "Not such file: $input_file"; } open(IN, "<$input_file") or die "Error opening file: $input_file"; my $line; my $finished = 0; my $frame; my %frames; my $frame_ref; my %words; my @frame_md5; my %frame_words; # Scan every line while(($line = ) && ($finished == 0)) { next if $line =~ /^#/; next if $line =~ /^[\ \t]*$/; # Handle EOF marker if (substr($line, 0, 4) eq "\@EOF") { $finished = 1; next; } if (substr($line, 0, 1) eq "\@") { # TODO: Parse action $frame = readaction($line); $frame_ref = abstractions($frame); $stats{'learned_frames'}++; undef @frame_md5; foreach $frame (@{$frame_ref}) { my $hash = md5hash($frame); push @frame_md5, $hash; if (not exists ($frames{$hash})) { $frames{$hash} = $frame; $stats{'unique_abstracted_frames'}++; } $stats{'learned_abstracted_frames'}++; } } # A '>' indicates input to a 'child' if (substr($line, 0, 1) eq ">") { # TODO: Parse speech $stats{'learned_speech_lines'}++; my $ref = readspeech($line, \%words, \@frame_md5, \%frame_words); } # A '<' indicate output of a 'child' if (substr($line, 0, 1) eq "<") { # TODO: Parse output speech? } } close(IN); print "Learning fase finished ...\n"; print Dumper(\%stats); ############################# # Check if we want to do some testing if ($options{'t'}) { $input_file = $options{'t'}; } else { exit; } print "Testing fase ...\n"; open(IN, "<$input_file") or die "Error opening file: $input_file"; $finished = 0; while(($line = ) && ($finished == 0)) { next if $line =~ /^#/; next if $line =~ /^[\ \t]*$/; if (substr($line, 0, 1) eq "\@") { # TODO: Parse action $frame = readaction($line); $frame_ref = abstractions($frame); # ArjanAndreas # print Dumper($frame_ref); undef @frame_md5; foreach $frame (@{$frame_ref}) { my $hash = md5hash($frame); push @frame_md5, $hash; if (not exists ($frames{$hash})) { $frames{$hash} = $frame; $stats{'test_unique_abstracted_frames'}++; } $stats{'test_abstracted_frames'}++; } speech(\@frame_md5, \%frame_words, \%words); } } close(IN); sub speech { my $frame_ref = $_[0]; my $frame_words_ref = $_[1]; my $words_ref = $_[2]; my %calc; undef %calc; my $top = 0; my $topword = ""; foreach my $md5 (@{$frame_ref}) { my $ref = ${$frame_words_ref}{$md5}{'words'}; foreach my $word (keys %{$ref}) { if (not exists $calc{$word}) { $calc{$word}{'count'} = 0; } $calc{$word}{'count'} += ${$frame_words_ref}{$md5}{'words'}{$word}; } } # my $tot = 0; # foreach my $k (keys %{$words_ref}) { # $tot += ${$words_ref}{$k}{'count'}; # } my %calc2; foreach my $word (keys %calc) { my $total = ${$words_ref}{$word}{'count'}; #$calc{$word}{'total'} = $total; my $divided = $calc{$word}{'count'} / $total; # my $divided = $calc{$word}{'count'} - $total; # my $divided = $calc{$word}{'count'} / $tot; # my $divided = $total / $calc{$word}{'count'} / $total; # ArjanAndreas # if ($total > 10 && $total < 45) { if ($total > 10) { $calc2{$word} = $divided; } # $calc2{$word} = $total; } my $c = 0; foreach my $row (sort { $calc2{$b} <=> $calc2{$a} } keys %calc2 ) { printf("%-20s %s\n", $row, $calc2{$row}); $c++; if ($c == 10) { print ".\n"; return; } } print ".\n"; } # Read speech sub readspeech { my $line = $_[0]; my $words_ref = $_[1]; my $frame_ref = $_[2]; my $frame_words_ref = $_[3]; my @words; # Filter $line = substr($line, 1); # Chop first char (it's a > sign) $line =~ s/\?//g; # Ignore some chars $line =~ s/\!//g; $line =~ s/ +/ /g; $line =~ s/^ //; $line =~ s/ $//; @words = split(" ", $line); foreach my $word (@words) { if (not exists ${$words_ref}{$word}) { ${$words_ref}{$word}{'count'} = 1; $stats{'unique_words'}++; $stats{'learned_words'}++; } else { ${$words_ref}{$word}{'count'} += 1; $stats{'learned_words'}++; } foreach my $md5 (@{$frame_ref}) { if (not exists ${$frame_words_ref}{$md5}{'words'}{$word}) { ${$frame_words_ref}{$md5}{'words'}{$word} = 1; } else { ${$frame_words_ref}{$md5}{'words'}{$word}++; } if (not exists ${$frame_words_ref}{$md5}{'count'}) { ${$frame_words_ref}{$md5}{'count'} = 1; } else { ${$frame_words_ref}{$md5}{'count'}++; } } } return \@words; } # Read an action sub readaction { my $line = $_[0]; my $data = ""; my $brackets = 0; my $finished = 0; # This parts continues to read the whole frame while($finished == 0) { my $tmp1 = $line; my $tmp2 = $line; $tmp1 =~ s/[^{]//g; $tmp2 =~ s/[^}]//g; $brackets += (length($tmp1) - length($tmp2)); # Add this line to data $data .= $line; if ($brackets == 0) { $finished = 1; } else { # Read next line if (eof (IN)) { die("Parse error, '}' not found before EOF"); } $line = ; } } # $data now contains the whole frame # next is to analize the frame and # store it in an hash table. # Replace returns by spaces my $value = tokenize($data, 0); # TODO: Make some uniq ID for value (md5?) # TODO: Split in smaller parts and store? return $value; } # This function creates all abstractions from a # frame and returns a reference to an array of # them. # sub abstractions() { # Initial fram my $frame = $_[0]; # Two framesets my @set; my @set2; # Reference my $ref; # String of abstraction my $abstraction = ""; # Inital, start with one; my $option1 = deepcopy($frame); my $option2; # Add current frame to set push @set, $option1; # For all keys, foreach my $key (keys(%{$frame})) { # If it's an hash, recure in it if (ref(${$frame}{$key}) eq "HASH") { # Reset @set2, undef @set2; # For all current sets foreach my $current (@set) { # Recurse and get abstractions $ref = abstractions(${$current}{$key}); # Using this reference, add all new # abstraction to set foreach my $current2 (@{$ref}) { my $new = deepcopy($current); my $new2 = deepcopy($current2); my $new3 = deepcopy($current); ${$new}{$key} = $current2; delete ${$new3}{$key} ; push @set2, $new; push @set2, $new2; # Chopped @ end push @set2, $new3; # Chopped @ begin } } # copy @set = @set2; } else { # save abstraction if ($key eq "abstraction") { $abstraction = ${$frame}{$key}; } } } # Found an abstraction if ($abstraction ne "") { undef @set2; foreach my $current (@set) { $option1 = deepcopy($current); $option2 = deepcopy($current); ${$option2}{'id'} = $abstraction; delete ${$option2}{'abstraction'}; push @set2, $option1; push @set2, $option2; } @set = @set2; } undef @set2; foreach my $current (@set) { my $i = scalar keys %{$current}; if ($i > 0) { push @set2, $current; } } return \@set2; } # TODO: Could be better to sort the hash first in some # way, so the frozen hash is the same for hashes with # the same content (but maybe different order) sub md5hash() { my $h = $_[0]; my $level = $_[1]; my $str = ""; if (!defined $level) { $level = 0; } foreach my $k (sort (keys %{$h})) { if (ref ${$h}{$k} eq "ARRAY" || ref ${$h}{$k} eq "HASH") { $str .= "$k:{"; $str .= md5hash(${$h}{$k}, $level+1); $str .= "},"; } else { $str .= "$k:" . ${$h}{$k} .","; } } if ($level == 0) { return md5_hex($str); } else { return $str; } } # This function tokenizes a block # it uses recursion sub tokenize() { my $data = $_[0]; my $level = $_[1]; my $i = 0; my $brackets = 0; my $key = ""; my $value = ""; my $key_done = 0; my %set; my $max = length($data); # Loop true data for($i = 0; $i < length($data); $i++) { # Active char my $char = substr($data, $i, 1); # If key name not set yet, we # assume this is about the key # name. this will finish after a ":" # is found if ($key_done == 0) { if ($char eq ":") { $key_done = 1; $key =~ s/[ |\t|\n]//g; next; } else { $key .= $char; } } # if the key name is done # then the rest will be a value # untill a ";" is found or a "{ }" # block is closed if ($key_done == 1) { if ($char eq ";" || $char eq "{") { $key_done = 0; $value =~ s/[ |\t|\n]//g; #print "Found value: $level $value\n"; $set{$key} = $value; } else { $value .= $char; } # This parts retrieves the whole {} block if ($char eq "{") { my $data_new = ""; $brackets = 1; my $finished = 0; while($finished == 0) { $i++; my $char = substr($data, $i, 1); if ($char eq "{") { $brackets++; } if ($char eq "}") { $brackets--; } if ($brackets == 0) { my $val = tokenize($data_new, $level+1); $set{$key} = $val; $set{$key}{'id'} = $value; $finished = 1; } else { $data_new .= $char; } } $key_done = 0; } if ($char eq ";" || $char eq "{") { $value = ""; $key = ""; } } } return \%set; } # Helper functions sub deepcopy { my $this = shift; if (not ref $this) { $this; } elsif (ref $this eq "ARRAY") { [map deepcopy($_), @$this]; } elsif (ref $this eq "HASH") { +{map { $_ => deepcopy($this->{$_}) } keys %$this}; } else { die "what type is $_?" } } # Meta info sub version { print STDERR "eend version 0.1.5\n"; print STDERR "use -h for help on this program\n\n"; } sub usage { print STDERR "eend [ -i file ] [ -t file ]\n"; print STDERR " -i input filename\n"; print STDERR " -t test training on file\n"; exit; }