#!/usr/bin/python2.5
import nltk, pickle
from esperantotagger import tag

""" report frequencies of adjectives and nominalizations of that adjective for a
    given stem. """
def nominalization(stem, word_fd):
	"""Print how often an adjective and its nominalizations occur for a stem.

	Two rows are printed, one for the plain stem and one for its
	``mal``-prefixed antonym.  Each row lists the frequency of the adjective
	(stem + 'a'), of the two nominalizations (stem + 'o' and stem + 'eco'),
	the sum of both nominalizations, and the ratio of that sum to the
	adjective frequency.  The ratio is reported as -1 when the adjective
	does not occur at all.

	stem    -- bare word stem, e.g. 'bon'.
	word_fd -- mapping from (word, tag) pairs to counts.  Forms are looked
	           up unconditionally, so a missing pair must yield 0 instead
	           of raising (e.g. an nltk.FreqDist).
	"""
	adj_tag, noun_tag = 'JJ', 'NNS'

	def inflected_count(entry):
		# Add up the base form and its -n, -j and -jn inflections.
		word, pos = entry
		return sum(word_fd[(word + ending, pos)]
			for ending in ('', 'n', 'j', 'jn'))

	def row(adj, noun_o, noun_eco):
		counts = [inflected_count(entry) for entry in (adj, noun_o, noun_eco)]
		noun_total = counts[1] + counts[2]
		if counts[0] != 0:
			ratio = noun_total / float(counts[0])
		else:
			# Sentinel: no adjective occurrences, so the ratio is undefined.
			ratio = -1
		return tuple(counts) + (noun_total, ratio)

	def forms(base):
		return ((base + 'a', adj_tag),
			(base + 'o', noun_tag),
			(base + 'eco', noun_tag))

	# Parenthesised single-argument print behaves the same as the print
	# statement on Python 2 and stays valid syntax on Python 3.
	print(" %s      adj     nom+o   nom+eco nom-sum ratio" % stem)
	print(" normal  %d      %d      %d      %d      %f" % row(*forms(stem)))
	print(" ant.    %d      %d      %d      %d      %f" % row(*forms('mal' + stem)))

def malvortoj(word_fd):
	"""Collect the uninflected forms of all ``mal``-prefixed words.

	word_fd -- mapping whose keys are (word, tag) pairs.

	Work in progress: the collected set and the two result lists are not
	used or returned yet, so the function always returns None.
	"""
	malwords = set()
	for word, _pos in word_fd.keys():
		if word.startswith('mal'):
			malwords.add(uninflect(word))
	onlymal, nevermal = [], []

def uninflect(word):
	"""Strip the Esperanto plural/accusative ending from a word.

	A trailing 'jn', 'n' or 'j' is removed (tried in that order, at most
	one ending is removed) and the remainder is returned.  Any other word,
	including the empty string, is returned unchanged.

	NOTE: the check is purely orthographic, so words that merely end in
	'n' or 'j' without being inflected (e.g. 'kaj', 'en') are shortened too.
	"""
	# 'jn' has to be tried first; otherwise only its 'n' would be removed.
	for ending in ('jn', 'n', 'j'):
		if word.endswith(ending):
			return word[:-len(ending)]
	return word

def pos_tag(sent):
	"""Pair every word of a sentence with the result of tag() for that word.

	sent -- iterable of words.

	Returns a list of (word, tag) tuples in the order of the input words.
	"""
	tagged = []
	for word in sent:
		tagged.append((word, tag(word)))
	return tagged

def preprocess(document):
	"""Turn a raw text into sentences of (word, tag) pairs.

	A Punkt sentence tokenizer is constructed from the document itself and
	then applied to that same document.  Every sentence is split into words
	with nltk.word_tokenize and the words are tagged with pos_tag.

	Returns a list holding one pos_tag result per sentence.
	"""
	splitter = nltk.tokenize.punkt.PunktSentenceTokenizer(document)
	raw_sentences = splitter.tokenize(document)
	tokenized = [nltk.word_tokenize(raw) for raw in raw_sentences]
	return [pos_tag(tokens) for tokens in tokenized]

def malegc(reader):
	"""Sort the ``mal``-prefixed words of a corpus into rough word classes.

	reader -- corpus reader whose words() method yields the corpus tokens.

	Tokens are lower-cased and passed through uninflect, then every word
	starting with 'mal' is classified by its ending:

	  adjectives/adverbs -- final 'a' or 'e'
	  verbs              -- final 'i', a final 'as', 'is' or 'os', or one of
	                        'at', 'it', 'ot', 'ant', 'int', 'ont' directly
	                        in front of the final letter
	  nouns              -- final 'o'

	The classes may overlap (a word ending in 'ita' is counted both as an
	adjective and as a verb).  Work in progress: the lists are not used or
	returned yet, so the function returns None.
	"""
	# Tuples give exact membership tests.  Testing against a concatenated
	# string such as 'asisos' is a substring test and would also accept
	# fragments that straddle two endings, e.g. 'si' or 'so'.
	tense_endings = ('as', 'is', 'os')
	short_participles = ('it', 'at', 'ot')
	long_participles = ('ant', 'int', 'ont')

	def is_verb(word):
		return (word[-1] == 'i'
			or word[-2:] in tense_endings
			or word[-3:-1] in short_participles
			or word[-4:-1] in long_participles)

	# words has to be called; iterating the bound method raises TypeError.
	words = (uninflect(word.lower()) for word in reader.words())
	mal = [word for word in words if word[:3] == 'mal']
	maladj = [word for word in mal if word[-1] in 'ae'] #plus adverbs
	malverb = [word for word in mal if is_verb(word)]
	malnoun = [word for word in mal if word[-1] == 'o']
	# TODO: the rest of this analysis was never written.

def malfreq(reader):
	"""Print a frequency table of the ``mal``-prefixed words of a corpus.

	reader -- corpus reader whose words() method yields the corpus tokens.

	Tokens are lower-cased and passed through uninflect before counting.
	After a header line, one tab-separated row is printed per ``mal`` word:
	the word, its frequency, and the frequency of the same word without the
	'mal' prefix.  Rows are ordered by descending frequency; words with the
	same frequency appear in descending word order.
	"""
	fdist = nltk.FreqDist(
		uninflect(token.lower()) for token in reader.words())
	rows = [(fdist[entry], entry) for entry in fdist if entry[:3] == 'mal']
	rows.sort(reverse=True)
	print('word\tmalfreq\tnonmalfreq')
	for count, word in rows:
		print('%s\t%d\t%d' % (word, count, fdist[word[3:]]))

def freqspect(reader):
	"""Print the frequency spectrum of ``mal`` words and of their bases.

	reader -- corpus reader whose words() method yields the corpus tokens.

	Tokens are lower-cased and passed through uninflect before counting.
	After a header line, one tab-separated row is printed for every
	frequency from 1 up to the highest frequency of any ``mal`` word: the
	frequency, the number of distinct ``mal`` words with that frequency,
	and the number of distinct bases (the ``mal`` words with the prefix
	removed) with that frequency.  Only the header is printed when the
	corpus contains no ``mal`` words.
	"""
	fdist = nltk.FreqDist(
		uninflect(word.lower()) for word in reader.words())
	mal = dict((word, fdist[word]) for word in fdist if word[:3] == 'mal')
	nonmal = dict((word[3:], fdist[word[3:]]) for word in mal)
	# Spectra: how many distinct words share each frequency value.
	ctot = nltk.FreqDist(mal.values())
	cnot = nltk.FreqDist(nonmal.values())

	# The header names exactly the three columns printed below.  It used to
	# announce adj/verb/noun columns from a per-word-class breakdown that
	# was never enabled.
	print("freq\tmal\tnonmal")
	if not mal:
		# max() of an empty sequence would raise ValueError.
		return
	for freq in range(1, max(mal.values()) + 1):
		print("%d \t%d\t%d" % (freq, ctot[freq], cnot[freq]))

# Load the pre-trained sentence tokenizer.  The file is opened in binary
# mode, as pickle data should be, and closed explicitly; try/finally is used
# because the `with` statement needs a __future__ import on Python 2.5.
# NOTE: pickle.load can execute arbitrary code from the file it reads, so
# esperanto.pickle must come from a trusted source.
tokenizer_file = open('esperanto.pickle', 'rb')
try:
	sent_tokenizer = pickle.load(tokenizer_file)
finally:
	tokenizer_file.close()

# Every file below the corpus directory is read as UTF-8 plain text.
reader = nltk.corpus.reader.PlaintextCorpusReader(
	'/home/andreas/books/esperanto', '.*', encoding='utf-8',
	sent_tokenizer=sent_tokenizer)
malfreq(reader)
#freqspect(reader)
#reader = preprocess('\n'.join(open(a).read() for a in os.listdir('/home/andreas/books/esperanto/')))
#fdist = nltk.FreqDist((word.lower(),tag) for word,tag in reader.tagged_words())


#for a in 'grand tut bon kar sam bel jun long nov alt'.split(): 
#	nominalization(a,fdist)
#	print