#!/usr/bin/python2.5
# Train an NLTK Punkt sentence tokenizer on a plain-text corpus and save the
# trained tokenizer as a pickle file in the current working directory.
import codecs
import pickle

import nltk.tokenize.punkt

# Make a new (untrained) tokenizer
tokenizer = nltk.tokenize.punkt.PunktSentenceTokenizer()

# Read in the training corpus as UTF-8 text (Esperanto, judging by the corpus
# and output file names).
# try/finally is used instead of `with` so the script keeps running on the
# Python 2.5 interpreter named in the shebang while still guaranteeing that
# the file handle is closed, even if reading or decoding fails.
# NOTE(review): the "U" mode flag is rejected by Python 3.11+; it is kept
# unchanged here because this script targets Python 2.
corpus = codecs.open("/tmp/eoall.txt", "Ur", "utf-8")
try:
    text = corpus.read()
finally:
    corpus.close()

# Train tokenizer on the whole corpus text
tokenizer.train(text)

# Dump pickled tokenizer; close the output file even if pickling raises so a
# failed run does not leave the handle open.
out = open("esperanto.pickle", "wb")
try:
    pickle.dump(tokenizer, out)
finally:
    out.close()

