#!/usr/bin/env python
# Language Acquisition project, June 2007, UvA.
"""
	OVERVIEW

# Important data structures:
# - Dictionary with words as keys, and as values a dictionary that maps
#   each framehash to the association score of that word and frame
e.g. associations['ball'] = { hash1234: 4, hash5678: 2, ... }

# - Dictionary with framehashes as keys, and the real frames as values
e.g. frameindex[hash1234] = _frameXYZ

#Initialization functions
readcorpus()
parseutterances()

#Print functions
printframe()
printprop()
printsituation()
frametostr() # create a unique, sorted string

#Learn Functions
createsubframes() <-- formerly known as 'abstractions'
associate() <-- formerly known as 'speech'

#Test Functions
done in main?
"""
from xml.dom import minidom
from sys import stdin
import math

def main():
	"""
	Build the word/frame associations from "nicecorpus.xml", then read one
	line of words from stdin and print, per word, the (at most) five
	frames with the highest score.
	(if this file is not called directly, main() will be ignored).
	"""
	# local import: the module itself only imports stdin from sys
	from sys import stdout
	#banner/silly disclaimer
	print("""Language Acquisition, one-word model. 2nd Year project UvA 2007
This program is not distributed in the hope that it will be useful,
so WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
""")
	xmldoc = minidom.parse("nicecorpus.xml").documentElement
	try:
		# blank the name attribute of every frame up front
		for frame in xmldoc.getElementsByTagName("frame"):
			frame.setAttribute("name", "")
		associations, frameindex = oneword(xmldoc)
		#
		# read words from stdin and find frames
		# (write + flush instead of a trailing-comma print, so the prompt
		# shows up immediately and the next print gets no stray space)
		stdout.write("Talk to me: ")
		stdout.flush()
		text = stdin.readline()
		for word in text.split():
			if word not in associations:
				print("%s not in corpus." % word)
				continue
			data = associations[word]
			print(word)
			# highest score first; equal scores are ordered by hash value
			ranked = sorted(((data[fhash], fhash) for fhash in data), reverse=True)
			for i, (score, fhash) in enumerate(ranked[:5]):
				print("match %d score: %s" % (i + 1, score))
				printframe(frameindex[fhash])
	finally:
		#clean up, also when something above went wrong
		xmldoc.unlink()

def oneword(xmldoc):
	"""
	walk over every situation of the corpus and accumulate the scores
	between the words and the (derived) frames of that situation.
	parameter: the root element of the corpus, use:
		xmldoc = minidom.parse("corpus.xml").documentElement
	return: a tuple (associations, frameindex)
	"""
	print("oneword: Reading corpus data..")
	associations = {}
	frameindex = {}
	situations = xmldoc.getElementsByTagName("situation")
	#
	print("oneword: Analyzing situations..")
	for situation in situations:
		# words first (registers them in associations), then the hashes
		# of the derived meanings, then score every word/meaning pair
		words = parseutterances(situation, associations)
		hashes = derivemeanings(situation, frameindex)
		associate(words, hashes, associations, method=0)
	# note: the scores are returned as is, correctassociations() is not applied
	return associations, frameindex

def derivemeanings(situation, frameindex):
	"""
	for a given situation, build the derived meanings and return the list
	that makehashes() produces for them (one hash per meaning, in order;
	meanings with a hash not yet in frameindex are stored there).

	The meanings are <meaning> elements built from the first "frame" child
	of the situation, or from the situation itself when it has no such
	child:
	- the complete frame
	- an otherwise empty frame holding only copies of the id and
	  abstraction elements
	- for both of these, the variant made by abstractiontoid()
	- everything recursiveframes() generates
	"""
	def abstractiontoid(frame):
		"""
		try to rename abstraction to an id element and delete the
		old id element, otherwise return an empty list.

		Works on a deep copy of the argument and looks at the first
		"frame" child of that copy. On success the copy is returned in
		a one-element list.
		"""
		aframe = frame.cloneNode(deep=1)
		try:
			# first child node of the abstraction element
			a = abst(frames(aframe).next()).childNodes[0]
		except StopIteration:
			# no "frame" child, or that frame has no abstraction element
			return  []
		# first child node of the id element; this lookup is not guarded,
		# a frame without an id element raises StopIteration here
		b = id(frames(aframe).next()).childNodes[0]
		# the id element gets (a copy of) the content of the abstraction..
		id(frames(aframe).next()).replaceChild(a.cloneNode(deep=1), b)
		# ..and the abstraction element itself is removed from the frame
		frames(aframe).next().removeChild(abst(frames(aframe).next()))
		return [aframe]
	#
	def recursiveframes(frame):
		"""
		recursively generate derived frames: for every "frame" child a
		meaning with that child plus the properties of its parent, and
		for every property a meaning with only that property.
		"""
		solutions = []
		for a in frames(frame):
			newframe = Kreator.createElement("meaning")
			# sub is only referenced by the commented-out line below
			sub = newframe.appendChild(a.cloneNode(deep=1))
			# removed this line because we do a global replace in the main
			#			sub.setAttribute("name", "sub")
			for b in properties(frame):
				newframe.appendChild(b.cloneNode(deep=1))
			# order of the results: abstraction-as-id variant (if any),
			# the meaning itself, then whatever the child frame yields
			solutions.extend(abstractiontoid(newframe))
			solutions.append(newframe)
			solutions += recursiveframes(a)
		for a in properties(frame):
			# a meaning consisting of a single property
			newframe = Kreator.createElement("meaning")
			newframe.appendChild(a.cloneNode(deep=1))
			solutions.append(newframe)
		return solutions
	#
	#kludge to create new elements
	Kreator = minidom.Document()
	try:
		originalframe = frames(situation).next()
	except StopIteration:	
		# no "frame" child: use the situation element itself
		originalframe = situation
	#
	# printframe(originalframe)
	fullmeaning = Kreator.createElement("meaning")
	fullmeaning.appendChild(originalframe.cloneNode(deep=1))
	emptyframe = Kreator.createElement("frame")
	#	emptyframe.setAttribute("name", "action")
	emptyframe.setAttribute("name", "")
	try:
		emptyframe.appendChild(id(originalframe).cloneNode(deep=1))
		emptyframe.appendChild(abst(originalframe).cloneNode(deep=1))
	except StopIteration:
		# id or abstraction element missing; whatever was appended before
		# the failing lookup stays in emptyframe
		pass
	emptymeaning = Kreator.createElement("meaning")
	emptymeaning.appendChild(emptyframe)	
	solutions = [fullmeaning, emptymeaning]
	solutions.extend(abstractiontoid(fullmeaning))
	solutions.extend(abstractiontoid(emptymeaning))
	# find subframes, ehh, lower stuff, et cetera
	solutions += recursiveframes(originalframe)
	return makehashes(solutions, frameindex)

def associate(utterances, meanings, associations, method=0):
	"""
	compute scoring between words and frames
	if the method argument > 0 unrelated frames
	will have their associations decreased.

	utterances: the words of one situation; every word must already be
		a key of associations
	meanings: the frame hashes of the same situation
	associations: dictionary word -> {framehash: score}, updated in place
	method: 0 only rewards co-occurrence (+2 per word/frame pair),
		> 0 additionally takes 1 off every frame known for the word
		that is not among meanings
	"""
	if method > 0:
		# build the lookup once instead of scanning the list per frame
		observed = set(meanings)
	for word in utterances:
		scores = associations[word]
		for meaning in meanings:
			#TODO: insert weird formula here
			#monotonically increasing:
			scores[meaning] = scores.get(meaning, 0) + 2
		# statistics below seem to degrade performance (at least in
		# some situation, eg. 'ball'), so not used by default
		if method > 0:
			for other in scores:
				if other not in observed:
					#assume other frames are unrelated to this word;
					#penalise that frame itself, not the frame that
					#happened to be rewarded last
					scores[other] -= 1

def correctassociations(associations, wordcount, meaningcount):
	"""
	do some math to correct associations: the scores of every word are
	divided by their sum, so that they add up to 1.

	associations: dictionary word -> {framehash: score}, changed in place
	wordcount, meaningcount: currently not used
	return: None
	"""
	for data in associations.values():
		total = sum(data.values())
		if total == 0:
			# no scores, or scores that cancel out: nothing to divide
			# by, leave this word as it is
			continue
		total = float(total)
		for fhash in data:
			data[fhash] = data[fhash] / total
	#ideas that are not implemented:
	#remove words that appear too often (assume they bear no semantic
	#information), eg. by word count divided by number of sentences
	#
	#logistic function?:
	#associations[a][b] = 1.0 / (1.0 + math.e ** (-1.0 * associations[a][b]))
	return
	
def parseutterances(situation, associations, method='default'):
	"""
	Create a list of single words of all the adult utterances combined,
	after stripping unwanted characters. If separate utterances are needed
	change this.  As this is the oneword stage, sentence boundaries are
	currently meaningless.

	situation: element whose "adult" children hold the utterances
	associations: dictionary word -> {framehash: score}; every returned
		word that is not yet a key gets an empty dictionary
	method: what to do with emphasized words (words starting with a '!'):
		'ignore'    use only the emphasized words
		'increase3' put every emphasized word in the list three times
		'default'   (or anything else) treat them as normal words
	return: the list of words, without '?' and without the '!' marks
	"""
	utterance = " ".join(adultutterances(situation))
	utts = utterance.replace("?", "").split()
	#
	if method == 'ignore':
		correctedwords = [a[1:] for a in utts if a.startswith('!')]
	elif method == 'increase3':
		correctedwords = []
		for a in utts:
			if a.startswith('!'):
				correctedwords.extend(3 * [a[1:]])
			else:
				correctedwords.append(a)
	else: # method == 'default'
		correctedwords = [a.replace("!", "") for a in utts]
	#
	# a token of nothing but '!' leaves an empty word behind; drop those
	correctedwords = [a for a in correctedwords if a]
	#
	# make sure the words in associations are known beforehand
	for a in correctedwords:
		associations.setdefault(a, {})
	return correctedwords
	
### Section Auxiliary Functions
def makehashes(meanings, frameindex):
	"""
	return the list of hashes of the given meanings (same order) and
	store every meaning whose hash is new in frameindex.
	"""
	hashes = []
	for meaning in meanings:
		key = framehash(meaning)
		# the first meaning seen with a certain hash is the one we keep
		frameindex.setdefault(key, meaning)
		hashes.append(key)
	return hashes

# Print Functions
def printsituation(situation):
	"""
	print the description of a situation, then its frames, then the
	utterances of adult and child in document order
	"""
	labels = {"adult": "adult:", "child": "child:"}
	description = situation.getElementsByTagName("description")[0]
	print("%s %s" % (" DESC:", description.childNodes[0].data))
	for subframe in frames(situation):
		printframe(subframe)
	for node in situation.childNodes:
		label = labels.get(node.nodeName)
		if label is None:
			# not an utterance
			continue
		print("%s %s" % (label, node.childNodes[0].data))

def printframe(frame):
	"""
	print the string representation of a frame, followed by a ruler
	"""
	ruler = '-' * 79
	print(frametostr(frame))
	#the ruler signifies the end of a top level frame
	print(ruler)

def frametostr(frame, nesting=0, removename=False):
	"""
	make 'human readable' string of a frame, for both pretty-printing
	and finding duplicates.

	frame: a "meaning", "frame" or "situation" element (any other node
		gets no header line)
	nesting: depth, in tabs, of the header line; the content is indented
		one tab further
	removename: if true the header of this frame reads "void" instead of
		its name; nested frames always keep their name
	return: the string. Properties are ordered by name and value and
		nested frames by their own string, so the order of these child
		nodes in the document has no influence on the result.
	"""
	indent = (nesting + 1) * '\t'
	nodename = frame.nodeName
	parts = []
	if nodename == "meaning":
		parts.append("MEANING:\n")
	elif nodename == "frame":
		if removename:
			name = "void"
		else:
			name = frame.getAttribute("name")
		parts.append(nesting * '\t' + "FRAME: " + name + '\n')
	elif nodename == "situation":
		parts.append("SITUATION:\n")
	try:
		parts.append(indent + "ID: " + id(frame).childNodes[0].data + '\n')
		parts.append(indent + "ABSTR: " + abst(frame).childNodes[0].data + '\n')
		#FIXME: result += (nesting+1)*'\t' + "WORDORDER: " + elementiterator("wordorder", frame).next().childNodes[0].data + '\n'
	except StopIteration:
		pass #not all frames need to have an abstraction element
	# sort on (name, value) pairs; a comparison function that answers
	# True/False does not define an ordering
	props = sorted((a.getAttribute("name"), a.childNodes[0].data)
			for a in properties(frame))
	for name, value in props:
		parts.append(indent + "PROP: " + name + " = " + value + '\n')
	# sort the nested frames on their text: sorting the nodes themselves
	# would order them by object identity
	parts.extend(sorted(frametostr(a, nesting + 1) for a in frames(frame)))
	return "".join(parts)

# Readability functions
def framehash(frame):
	"""
	return a hash value for a frame. The frame is first turned into its
	string representation (with the name of the top frame left out) and
	that string is hashed, so frames are compared on content ("deep")
	and not on object reference.
	"""
	astext = frametostr(frame, removename=True)
	return hash(astext)

def id(frame):
	"""
	return the first "id" child element of frame, raise StopIteration
	if there is none. (note: hides the builtin id() in this module)
	"""
	for element in elementiterator("id", frame):
		return element
	raise StopIteration
def abst(frame):
	"""
	return the first "abstraction" child element of frame, raise
	StopIteration if there is none.
	"""
	for element in elementiterator("abstraction", frame):
		return element
	raise StopIteration
	
def frames(frame):
	""" iterator over the "frame" child elements of frame """
	subframes = elementiterator("frame", frame)
	return subframes
def properties(frame):
	""" iterator over the "prop" child elements of frame """
	props = elementiterator("prop", frame)
	return props
def adultutterances(frame):
	"""
	generate the utterances of the "adult" child elements of frame as
	strings (hacky code: takes the data of the first child node)
	"""
	for adult in elementiterator("adult", frame):
		firstnode = adult.childNodes[0]
		yield firstnode.data

def elementiterator(tag, frame):
	"""
	generator over the direct children of frame whose node name is tag,
	in document order
	"""
	for child in frame.childNodes:
		if child.nodeName != tag:
			continue
		yield child
### Start the program (only when run as a script, not when imported)
if __name__ == "__main__":
    main()
