#Andreas v Cranenburgh 0440949
#Ricus Smid 9660208

#pos tagging (assignment 5), May 2008.

import itertools
import operator
import re
import string
import sys

def main():
	#corpus files. format is "w1/t1 w2/t2 ..." etc.
	TRAIN = "WSJ02_21.pos"
	TEST =  "WSJ23.pos"

	#k parameter for Katz smoothing. counts of 1..k will be smoothed.
	k = 4
	#language model uses trigrams, so n = 3:
	n = 3

	#don't tag sentences with more than 'maxlen' words
	maxlen = 15
	
	#flags, one can benchmark each type of smoothing individually.
	smoothlang = True
	smoothlex = True

	#end of start-up parameters. no user-servicable code below.

	print "n = ", n
	print "k = ", k

	#read corpora, apply regular expressions to filter out unwanted
	#punctuation, then separate words with split()
	#split() reduces sequences of whitespace to one separator by default
	traincorpus = open(TRAIN).read()
	traincorpus = processcorpus(traincorpus)

	tags = [a[1] for a in traincorpus]
	print "training corpus loaded"

	langmod = makengram(tags, 3)
	print "trigram language model created"
	
	#vocabulary size, number of unique ngrams in corpus
	voc = len(langmod)
	
	#make histogram with counts of counts
	countvalues = histogram(langmod)
	unigramtags = dict(((a[0],b) for (a,b) in makengram(tags, 1).items()))
	tagvoc = len(unigramtags)
	
	#number of possible 0-events
	n0count = tagvoc ** n - len(langmod)
	#smoothed count for 0-events
	n0 = countvalues[1] / float(n0count)

	#do smoothing
	if smoothlang:
		langmodkatz = discountGTkatz(langmod.copy(), countvalues, k)	
		print "katz smoothing of language model done"
	else:
		langmodkatz = langmod

	langmod1 = calculatenminusgram(langmod, n0, voc)
	langmodkatz1 = calculatenminusgram(langmodkatz, n0, voc)
	print "bigram language model created"
	
	for a in langmod:
		langmod[a] = langmod[a] / float(langmod1[a[:-1]])
		langmodkatz[a] = langmodkatz[a] / float(langmodkatz1[a[:-1]])
	print "trigram counts divided by bigram counts"
	
	lex = makengram(traincorpus, 1)
	print "lexical model created"
	
	wordtagdict = createwordtagdict(lex, unigramtags)
	print "word-tag dictionary created"
	
	#smoothing for lexical model:
	lexkatz = lex.copy()
	if smoothlex:
		for a,b in lex.items():
			if b == 1:
				lexkatz[a] = 0.5

		#create a dictionary of how many times a tag occurs once with words
		tagonce = {}
		for ((word, tag),), c in lex.items():
			if c == 1:
				try:
					tagonce[tag] += 1
				except KeyError:
					tagonce[tag] = 1
	
		#create a dictionary with zero scores for tags
		tagzerocounts = {}
		for tag in tagonce:
			# skip "PERIOD" as it is not a real tag
			if tag == "PERIOD": continue
			tagzerocounts[tag] = 0.5 * tagonce[tag] / unigramtags[tag]
		del tagonce
	
	lexmod = {}; lexmodkatz = {}
	if smoothlex:
		for a in lex:
			lexmod[a[0]] = lex[a] / float(unigramtags[a[0][1]])
			lexmodkatz[a[0]] = lexkatz[a] / float(unigramtags[a[0][1]])
		print "lexical model smoothed"
	else:	
		for a in lex:
			lexmod[a[0]] = lex[a] / float(unigramtags[a[0][1]])
		lexmodkatz = lexmod

	del lex, lexkatz

	testcorpus = open(TEST).read()
	#test corpus should be a list of sentences
	testcorpus = processcorpus(testcorpus, True)
	print "test corpus loaded"


	#finally, run the models on the training corpus
	nosmoothing = evaluatetagger(langmod, langmod1, lexmod, unigramtags, wordtagdict, {}, n, 0, voc, maxlen, testcorpus)
	smoothed = evaluatetagger(langmodkatz, langmodkatz1, lexmodkatz, unigramtags, wordtagdict, tagzerocounts, n, n0, voc, maxlen, testcorpus)
	
	print "without smoothing: precision = %f, recall = %f, \n\tcorrect: %d, not tagged: %d, N: %d" % nosmoothing
	print "katz smoothing: precision = %f, recall = %f, \n\tcorrect: %d, not tagged: %d, N: %d" % smoothed
	#end-of-program

#read test corpus and guesstimate tags.
def evaluatetagger(langmod, langmod1, lexmod, unigramtags, wordtagdict, tagzerocounts, n, n0, voc, maxlen, testcorpus):
	"""Tag every sentence of testcorpus (up to maxlen words each) with
	tagsentence() and return (precision, recall, correct, nottagged, N),
	where precision/recall are percentages and the rest are counts."""
	correct, nottagged, N = 0, 0, 0
	#tags seen exactly once with some word, ordered by their smoothed
	#zero-count score (highest first); used as candidates for unknown words
	oncetags = [a for a, b in dictsort(tagzerocounts)]
	#trick to remove duplicates from list:
	#NOTE(review): dict.fromkeys does not preserve the sorted order on
	#Python 2, so the frequency ordering from dictsort is lost here -- verify
	oncetags = {}.fromkeys(oncetags).keys()
	#mem = {}
	for sentence in testcorpus:
		#skip sentences longer than x (eg., 15) words
		if len(sentence) > maxlen: continue
		# bounded exhaustive algorithm
		correct1, nottagged1, N1 = tagsentence(langmod, langmod1, lexmod, unigramtags, wordtagdict, tagzerocounts, oncetags, n, n0, voc, sentence)
		#attempt at viterbi:
		#correct1, nottaged1, N1 = tagviterbi(langmod, langmod1, lexmod, unigramtags, wordtagdict, tagzerocounts, oncetags, n, n0, voc, sentence)
		correct += correct1; nottagged += nottagged1; N += N1
	
	print

	#N = number of words (excluding sentences > 15 words)
	#NOTE(review): raises ZeroDivisionError when no sentence qualifies (N == 0)
	recall = float(correct) / N * 100
	precision = float(correct) / (N - nottagged) * 100
	return precision, recall, correct, nottagged, N

def tagviterbi(langmod, langmod1, lexmod, unigramtags, wordtagdict, tagzerocounts, oncetags, n, n0, voc, sentence):
	"""Tag one sentence with the (experimental) Viterbi decoder and score
	it against the gold tags; returns score()'s (correct, nottagged, N)."""
	#strip the gold tags, keep only the words
	sentence1 = [c[0] for c in sentence]
	#tags = gettags(sentence, wordtagdict, oncetags, tagzerocounts)
	a = forward_viterbi(sentence1, unigramtags.keys(), langmod1, langmod, lexmod)
	#a[1] is the most probable tag path found
	print a[1]
	return score(sentence, a[1])

def tagsentence(langmod, langmod1, lexmod, unigramtags, wordtagdict, tagzerocounts, oncetags, n, n0, voc, sentence):
	"""Tag one sentence by exhaustively scoring candidate tag sequences.

	Every combination of candidate tags (bounded by limittags) is scored
	as language-model probability times lexical probability; the highest
	scoring sequence is handed to score(), whose (correct, nottagged, N)
	triple is returned."""
	maxz, maxtags = 0, []
	
	#candidate tag lists per word; empty means the sentence is untaggable
	tags = gettags(sentence, wordtagdict, oncetags, tagzerocounts)
	if tags == []: return 0, 0, 0
	sentence1 = [c[0] for c in sentence]
	
	#bound the number of tag sequences to keep the search tractable
	tags = limittags(tags, 2**24)
	for i, a in enumerate(cartesian_product(*tags)):
		#try at most 2**15 tag sequences. this means that for
		#a sentence of 15 words, for every word two tags will
		#be considered (in the worst case that every word has at least
		#2 possible tags, that is)
		#if i == 2**15: break
		#x = language model probability of this tag sequence
		x = sentenceprobability(langmod, langmod1, n, n0, voc, a)
		#x = altsentprob1(langmod, langmod1, mem, n, n0, voc, ('PERIOD',) + a + ('PERIOD',))
		if x == 0:
			#skip this tag sequence if it has zero probability
			continue

		#y = lexical probability: product of P(word|tag) over the words,
		#falling back to the tag's zero-count score or n0 for unseen pairs
		try:
			y = lexmod[(sentence1[0], a[0])]
		except KeyError:
			try:
				y = tagzerocounts[a[0]]
			except KeyError:
				if n0 == 0: continue
				else: y = n0
		for b in zip(sentence1, a)[1:]:
			try:
				y *= lexmod[b]
			except KeyError:
				try:
					y *= tagzerocounts[b[1]]
				except KeyError:
					if n0 == 0: y = 0
					else: y *= n0
		#combined score: language model times lexical model
		z = x * y
		#z = y / x
		if z > maxz:
			maxz = z
			maxtags = a
	print i + 1, '\t',	#show how many tag sequences were tried
	
	return score(sentence, maxtags)

def gettags(sentence, wordtagdict, oncetags, tagzerocounts):
	"""Return, for each word of the sentence, a list of candidate tags.

	Known words get their tags from wordtagdict.  Unknown words get
	['NNP'] when capitalized (assumed proper noun) or the oncetags
	fallback list when smoothing is on, and [None] when smoothing is off
	(tagzerocounts empty).  The gold tag in the input is ignored here."""
	tags = []
	for word, tag in sentence:
		try:
			tags.append(wordtagdict[word])
		except KeyError:
			#new word, no tags found. in case we are smoothing,
			#use tags that occured once with some words, otherwise
			#use None as 'tag'.
			if not tagzerocounts == {}:
				#if there is capitalized word, assume it is a proper noun
				if word[0] in string.uppercase:
					tags.append(['NNP'])
				else:
					tags.append(oncetags)
			else:
				tags.append([None])
	return tags
	
def score(sentence, maxtags):
	"""Compare the chosen tags 'maxtags' with the gold tags in 'sentence'.

	Returns (correct, nottagged, N): correctly tagged words, words that
	received no tag, and the sentence length.  Also prints each word with
	its gold tag, plus the wrong guess in brackets where applicable.
	NOTE(review): if maxtags is shorter than the sentence (e.g. empty),
	zip() truncates and the remaining words stay counted as untagged."""
	correct, nottagged, N = 0, 0, 0
	#start by assuming nothing was tagged; subtract for each tagged word
	nottagged += len(sentence)
	for a, b in zip(maxtags, sentence):
		if a == b[1]:
			correct += 1
			nottagged -= 1
			print "/".join(b),
		elif a == None:
			#word could not be tagged at all
			print '*', "/".join(b), "[--]",
		else:
			#wrong tag: show the guess in square brackets
			print "/".join(b), "[" + a + "]",
			nottagged -= 1
	N += len(sentence)
	print #print "|",
	sys.stdout.flush()

	return correct, nottagged, N

#make only a sequence with a subset of most likely tags for some sentence
def limittags(tagseq, maxtags):
	"""Prune the per-word tag lists of 'tagseq' (in place) until the total
	number of possible tag sequences is at most 'maxtags'.

	Tags are dropped from the end of each list -- the least frequent ones,
	since createwordtagdict sorts them by frequency -- in round-robin
	fashion over the words.  Returns the (mutated) tagseq."""
	#calculate number of possible tag sequences:
	n = reduce(operator.mul, [len(a) for a in tagseq])
	a = 0
	while n > maxtags:
		if len(tagseq[a]) > 1:
			#divide out the old list length, drop one tag, multiply back in
			n = n / len(tagseq[a])
			tagseq[a] = tagseq[a][:-1]
			n = n * len(tagseq[a])
		#increment index, wrap around upon reaching end of list
		a = (a + 1) % len(tagseq)
	return tagseq
		
#map each word to the list of tags it was observed with, most frequent
#tag first.  'lexmod' keys are 1-tuples holding a (word, tag) pair;
#'unigramtags' supplies overall tag frequencies for the ordering.
def createwordtagdict(lexmod, unigramtags):
	wordtags = {}
	#collect the distinct tags seen with each word
	for key in lexmod:
		word, tag = key[0]
		seen = wordtags.setdefault(word, [])
		if tag not in seen:
			seen.append(tag)

	#order every tag list by overall tag frequency, highest first
	for word in wordtags:
		wordtags[word].sort(key=unigramtags.__getitem__, reverse=True)

	return wordtags

#derive an (n-1)-gram table from an n-gram table by summing the counts of
#all seen n-grams sharing a prefix, plus the assumed mass of the n-grams
#with that prefix not yet seen.  If the input table is smoothed, the
#result is smoothed as well.
def calculatenminusgram(ngram, n0, voc):
	lower = {}
	#prior mass assumed for a prefix before its first observation; uses
	#voc - 1 since a prefix in the table was observed at least once
	unseenmass = (voc - 1) * n0
	for key, count in ngram.items():
		prefix = key[:-1]
		if prefix in lower:
			#already initialized: add this count, take back one n0
			lower[prefix] += count - n0
		else:
			#first sighting: prior mass plus this count
			lower[prefix] = unseenmass + count
	return lower

#count all ngrams of a given length n in the corpus
def makengram(corpus, n):
	counts = {}
	#slide a window of length n over the corpus
	for start in range(len(corpus) - n + 1):
		window = corpus[start:start + n]
		#sentence boundary handling: if a PERIOD appears mid-window
		#(not first, not last), pad everything before it with PERIODs
		#so the ngram does not straddle two sentences
		if "PERIOD" in window:
			pos = window.index("PERIOD")
			if 0 < pos < n - 1:
				window[:pos] = ["PERIOD"] * pos
		key = tuple(window)
		counts[key] = counts.get(key, 0) + 1
	return counts

#Good-Turing discounting applied to every entry:
#  r* = (r + 1) * N_{r+1} / N_r
#where cv maps a count r to the number of types seen r times.
#note: if N_{r+1} is 0 the smoothed value is 0 as well.
def discountGTall(ngram, cv):
	discounted = {}
	for key in ngram:
		r = ngram[key]
		discounted[key] = (r + 1) * (cv[r + 1] / float(cv[r]))
	return discounted

#Katz discounting: apply Good-Turing only to counts <= k; counts above
#the threshold k are considered reliable and kept as they are.
def discountGTkatz(ngram, cv, k):
	result = {}
	#this factor of the Katz formula does not depend on the count,
	#so compute it once outside the loop
	kterm = ((k + 1) * cv[k + 1]) / float(cv[1])
	for key, r in ngram.items():
		if r > k:
			#reliable count: no discounting
			result[key] = r
		else:
			#Good-Turing estimate, renormalized by the Katz correction
			rstar = (r + 1) * (cv[r + 1] / float(cv[r]))
			result[key] = (rstar - r * kterm) / float(1 - kterm)
	return result

#probability of a tag sequence under the ngram Markov model.
#PERIOD markers are added on both sides so sentence boundaries count.
def sentenceprobability(ngrams, nminusgram, n, n0, voc, seq1):
	padded = ("PERIOD",) + seq1 + ("PERIOD",)

	#start from the probability mass of the initial (n-1)-gram,
	#falling back to the smoothed zero count
	if padded[:n - 1] in nminusgram:
		prob = nminusgram[padded[:n - 1]]
	else:
		prob = n0

	#multiply in the relative frequency of every ngram window
	for end in range(n, len(padded) + 1):
		window = padded[end - n:end]
		if window in ngrams:
			factor = ngrams[window]
		elif window[:-1] in nminusgram:
			#unseen ngram over a seen history: back off to the smoothed
			#zero count divided by the history count
			factor = n0 / nminusgram[window[:-1]]
		elif n0 != 0:
			#history unseen as well: n0 / (voc * n0) = 1 / voc
			factor = 1.0 / voc
		else:
			#no smoothing: an unseen event zeroes the whole product
			return 0

		if factor == 0:
			return 0
		prob *= factor
	return prob

#memoizing iterative sentence probability calculator
def altsentprob1(ngrams, nminusgram, mem, n, n0, voc, seq1):
	"""Like sentenceprobability(), but intended to cache probabilities of
	short prefixes (length < maxmem) in the shared 'mem' dictionary so
	repeated prefixes across tag sequences are not recomputed."""
	seq = ("PERIOD",) + seq1 + ("PERIOD",)
	#maximum seq length to memoize:
	maxmem = 5
	#freq = 1
	#initial frequency is bigram frequency:
	try:
		freq = nminusgram[seq[:2]]
	except KeyError:
		freq = n0
	start = 2

	#look for a memoized prefix to resume from
	#NOTE(review): range(len(seq), maxmem) is empty whenever
	#len(seq) >= maxmem, so the cache is never consulted for typical
	#sequences -- possibly range(maxmem, len(seq)) or a reversed range
	#was intended; verify before relying on this function.
	for a in range(len(seq), maxmem):
		try:
			freq = mem[seq[:a]]
			start = a
			break
		except KeyError:
			pass
	
	#NOTE(review): this loop starts at 2 rather than n, so for n == 3 the
	#first window seq[2-n:2] is degenerate (negative slice start) -- compare
	#with sentenceprobability(), which starts at n.
	for a in range(start, len(seq)+1):
		# inline version of "relativefrequency" for performance:
		rseq = seq[a-n:a]
		try:
			relfreq = ngrams[rseq]
		except KeyError:
			try:
				relfreq = n0 / nminusgram[rseq[:-1]]
			except KeyError:
				#if word was not in trainingcorpus the rel freq is n0/voc*n0 = 1/voc
				relfreq = 1.0/voc
		#end inline version "relativefrequency"

		freq *= relfreq
		if a < maxmem:
			mem[seq[:a]] = freq
	return freq

#memoizing recursive sentence probability calculator
#NOTE(review): despite the name, memoization is currently disabled; the
#'mem' parameter is accepted but never consulted (kept for interface
#compatibility with the call sites).
def altsentprob(ngrams, nminusgram, mem, n, n0, voc, seq):
	"""Recursively compute the probability of tag sequence 'seq'.

	Base case: a length-2 sequence is looked up in the (n-1)-gram table,
	falling back to the smoothed zero count n0.  Otherwise the last n
	items are scored (backing off to n0 over the history count, then to
	the uniform estimate 1/voc) and the recursion continues on the rest.
	Note the recursion strips n items per step, so it only terminates
	cleanly when len(seq) % n == 2.

	The unreachable 'if not True: pass' scaffolding around the original
	body (a disabled memoization hook) has been removed; behavior is
	unchanged."""
	if len(seq) == 2:
		try:
			return nminusgram[seq]
		except KeyError:
			return n0
	#score the final ngram of the sequence
	try:
		p = ngrams[seq[-n:]]
	except KeyError:
		try:
			p = n0 / nminusgram[seq[-n+1:]]
		except KeyError:
			p = 1.0 / voc
	return p * altsentprob(ngrams, nminusgram, mem, n, n0, voc, seq[:-n])

#count-of-counts table: hist[r] = number of ngram types seen exactly r
#times.  Slots 1 .. max_count+1 are pre-filled with zeros so that later
#lookups of r+1 (Good-Turing) never raise KeyError.
def histogram(ngram):
	top = max(ngram.values()) + 2
	hist = {}
	for r in range(1, top):
		hist[r] = 0
	for count in ngram.values():
		hist[count] += 1
	return hist

#sort dictionary by value, highest first
def dictsort(d):
	"""Return d's (key, value) pairs as a list sorted by value, descending.

	Uses sorted() instead of the manual list.sort()/reverse() pair, which
	is the idiomatic form and also works with dict item views.  The
	ascending sort followed by a reversal is kept (rather than
	reverse=True) so the relative order of equal values is exactly as
	before."""
	#operator.itemgetter(1) extracts the value from each (key, value) pair
	return sorted(d.items(), key=operator.itemgetter(1))[::-1]


def processcorpus(corpus, splitsentences=False):
	"""Turn a raw "w1/t1 w2/t2 ..." corpus string into (word, tag) tuples.

	End-of-sentence markers './.' are first rewritten to PERIOD/PERIOD.
	Tokens without a '/' and tokens whose tag does not start with an
	uppercase letter (i.e. punctuation tags) are discarded.

	With splitsentences=False a flat list of (word, tag) tuples is
	returned; with splitsentences=True a list of sentences, each itself a
	list of (word, tag) tuples."""
	#first add end-of-sentence markers:
	corpus = corpus.replace('./.', ' PERIOD/PERIOD ')
	
	if splitsentences:
		#split corpus into a list of sentences:
		corpus = [a for a in corpus.split(" PERIOD/PERIOD ")]
		#for each sentence, split it into words, 
		#split those words into word/tag tuples (rsplit keeps any '/'
		#inside the word itself intact, only the last one separates the tag)
		#filter out tags that do not start with an uppercase char.
		return [[c for c in [tuple(b.rsplit('/', 1)) for b in a.split() if '/' in b] if c[1][0] in string.uppercase] for a in corpus]
	else:
		#split into list of words
		corpus = corpus.split()
		#now split "word/tag" pairs into (word, tag) tuples:
		corpus = [tuple(a.rsplit('/', 1)) for a in corpus if '/' in a]
		#remove noise and tags for punctuation.
		return [a for a in corpus if a[1][0] in string.uppercase]

#cartesian product of arbitrary number of arguments.
#call this as "cartesian_product(*listoflists)". Do not forget that asterisk.
def cartesian_product(L, *lists):
	"""Return an iterator of tuples forming the cartesian product of the
	given sequences, in the same lexicographic order as the original
	hand-rolled recursive generator.

	Delegates to itertools.product, which implements exactly this in C,
	avoiding one level of Python recursion per input list."""
	return itertools.product(L, *lists)

#As borrowed / adapted from:
# http://en.wikipedia.org/wiki/Viterbi_algorithm
#this is currently vaporware. need to adapt code to 2nd order markov model.
#currently expects 1st order model.
def forward_viterbi(y, X, sp, tp, ep):
	"""Combined forward/Viterbi pass over observation sequence y.

	y  -- list of observed words
	X  -- list of states (tags)
	sp -- start probabilities, keyed by ('PERIOD', state) tuples
	tp -- transition probabilities
	ep -- emission probabilities, keyed by (word, tag) tuples

	Returns (total probability, best path, best path probability).
	NOTE(review): tp is indexed with source_state + next_state, which is
	string concatenation here (states are plain tag strings) while the
	trigram language model uses tuple keys -- so that lookup presumably
	always falls through to the 1e-5 floor; confirm before using."""
	T = {}
	for state in X:
		##          prob.      V. path  V. prob.
		try:
			T[state] = (sp[('PERIOD', state)], [state], sp[('PERIOD', state)])
		except KeyError:
			#unseen start bigram: small floor probability
			T[state] = (1e-5, [state], 1e-5)
		
	for output in y:
		U = {}
		for next_state in X:
			total = 0
			argmax = None
			valmax = 0
			for source_state in X:
				#NOTE(review): source_state1 is iterated but never used;
				#this multiplies the work (and the summed 'total') by
				#len(X) -- apparently a leftover of the unfinished
				#2nd-order adaptation mentioned above.
				for source_state1 in X:
					(prob, v_path, v_prob) = T[source_state]
					try:
						p = ep[(output, next_state)] * tp[source_state + next_state]
					except KeyError:
						p = 1e-5	
					prob *= p
					v_prob *= p
					total += prob
					if v_prob > valmax:
						argmax = v_path + [next_state]
						valmax = v_prob
				U[next_state] = (total, argmax, valmax)
		T = U
	## apply sum/max to the final states:
	total = 0
	argmax = None
	valmax = 0
	for state in X:
		(prob, v_path, v_prob) = T[state]
		total += prob
		if v_prob > valmax:
			argmax = v_path
			valmax = v_prob
	return (total, argmax, valmax)


#"""
if __name__ == '__main__':
	# Import Psyco if available (optimizing JIT compiler)
	try:
		import psyco
		psyco.full()
	except ImportError:
		pass
	main()
#"""

#instead of the above, this can be used to enable profiling:
"""
import hotshot
from hotshot import stats
prof = hotshot.Profile("hotshot_edi_stats")
prof.runcall(main)
prof.close()
s=stats.load("hotshot_edi_stats")
s.sort_stats("time").print_stats()
"""
