esperantotagger

#!/usr/bin/env python
# -*- coding: UTF-8 -*-

# Rule-based tagger for Esperanto. Andreas van Cranenburgh, 2008
#
# Tagset based on Penn treebank tagset, see:
# ftp://ftp.cis.upenn.edu/pub/treebank/doc/tagguide.ps.gz
import codecs, operator, locale, sys
# hacks to do UTF-8 from stdin/out: wrap stdout and stderr in a writer that
# encodes to the locale's preferred encoding.
#sys.stdout = codecs.EncodedFile(sys.stdout, 'utf-8')
sys.stdout = codecs.getwriter(locale.getpreferredencoding())(sys.stdout)
sys.stderr = codecs.getwriter(locale.getpreferredencoding())(sys.stderr)

# Closed word classes. Every set holds unicode strings so that membership
# tests against decoded (unicode) tokens work for non-ASCII entries as well.

# personal pronouns
PRP = set(u"mi vi li ŝi ĝi ni ili oni si ci mem".split())
# possessive pronouns
PRPd = set(u"mia via lia ŝia ĝia nia ilia onia sia cia".split())
# closed class adverbs [that do not end with -e]
RB = set(u"tie ie ĉie nenie tiam iam ĉiam neniam iel tiel ĉiel neniel ial tial ĉial iom ĉiom tiom neniom ajn mem jen ja tamen eĉ dum jam ĵus plu ĉi for nun nur tre tro tuj ne jes ju des".split())
# wh-adverbs
WRB = set(u"kiel kiam kial kiom".split())
# coordinating conjunctions
CC = set(u"kaj aŭ nek kvankam sed ĉar".split())
# prepositions and subordinating conjunctions
IN = set(u"antaŭ malantaŭ anstataŭ do dum en laŭ krom kun malgraŭ se ke al apud ĉe ĉirkaŭ kontraŭ preter de da je el ĝis ekster inter po por post per pri sen sub super sur tra trans".split())
# determiners
DT = set(u"la kiu iu ĉiu tiu kies ies ties ĉies kia tia ia ĉia".split())
# numbers written out
# (unicode literal: a byte-string literal here would keep "naŭ", "naŭdek",
# "naŭcent" and "naŭmil" from ever matching a unicode token under Python 2)
NUMBER = set(u"unu du tri kvar kvin ses sep ok naŭ dek cent mil dudek tridek kvardek kvindek sesdek sepdek okdek naŭdek ducent tricent kvarcent kvincent sescent sepcent okcent naŭcent dumil trimil kvarmil kvinmil sesmil sepmil okmil naŭmil dekmil centmil".split())
# comparison words; tag() maps these to "JJ"
JJR = set(u"pli malpli ol plej".split())

def preprocess(file):
    """Print a tagged, bracketed rendering of a UTF-8 text file.

    The file is read in one go, lowercased with lower(), and punctuation is
    separated from the neighbouring words.  The text is then cut at every
    '.' (so the periods themselves are not emitted) and each fragment is
    printed on its own line as ``(S (TAG word) (TAG word) ... )``.

    file -- path of the UTF-8 encoded input file.
    """
    import re  # local import: only the dash handling below needs it

    # Read everything up front and close the handle right away.
    with codecs.open(file, encoding='utf-8') as infile:
        corpus = lower(infile.read())

    #split punctuation:
    corpus = corpus.replace(',', ' ,').replace('.', ' .').replace('"',
            ' " ').replace(';', ' ;')
    # Handle all dash variants in a single pass.  Chaining
    # replace('---', ...) and replace('--', ...) would split the freshly
    # spaced ' --- ' a second time into '--' and '-'.
    corpus = re.sub(u'---|--|—',
            lambda m: u' --- ' if m.group(0) == u'---' else u' -- ',
            corpus)
    corpus = corpus.replace('?', ' ?').replace('!', ' !')

    for sent in corpus.split('.'):
        parts = [u'(S']
        parts.extend(u'(%s %s)' % (tag(word), word) for word in sent.split())
        parts.append(u')')
        # A single joined string prints identically whether print is a
        # statement or a function.
        print(u' '.join(parts))

46

def preprocessmorph(file):
    """Print one ``(TAG word)`` line for every distinct token of a file.

    The file is read as UTF-8, lowercased with lower(), most punctuation is
    replaced by spaces, and the remaining whitespace-separated tokens are
    collected in an nltk FreqDist.  Each distinct token is then printed
    together with the tag that tag() assigns to it.

    file -- path of the UTF-8 encoded input file.
    """
    # Read everything up front and close the handle right away.
    with codecs.open(file, encoding='utf-8') as infile:
        corpus = lower(infile.read())

    #split punctuation:
    # NOTE(review): unlike the other marks, the double quote and the em dash
    # are kept as separate tokens instead of being dropped -- confirm that
    # this is intended.
    replacements = (
        (u',', u' '), (u'.', u' '), (u'"', u' " '), (u';', u' '),
        (u'---', u' '), (u'--', u' '), (u'—', u' -- '),
        (u'?', u' '), (u'!', u' '),
    )
    for old, new in replacements:
        corpus = corpus.replace(old, new)

    from nltk import FreqDist
    fd = FreqDist(corpus.split())
    for word in fd:
        print(u'(%s %s)' % (tag(word), word))

58

def main():
    """Tag UTF-8 text from /dev/stdin and report the untagged words.

    Each input line is printed to stdout as a sequence of ``(TAG word)``
    pairs.  Afterwards the set of words for which tag() returned u'none',
    followed by coverage statistics, is written to stderr.
    """
    import re  # local import: only the dash handling below needs it

    #filename to read:
    source = "/dev/stdin"
    #read corpus as one string (and close the handle again):
    with codecs.open(source, encoding='utf-8') as infile:
        text = infile.read()

    #split punctuation:
    text = text.replace(',', ' ,').replace('.', ' .').replace('"',
            ' " ').replace(';', ' ;')
    # Handle all dash variants in a single pass.  Chaining
    # replace('---', ...) and replace('--', ...) would split the freshly
    # spaced ' --- ' a second time into '--' and '-'.
    text = re.sub(u'---|--|—',
            lambda m: u' --- ' if m.group(0) == u'---' else u' -- ',
            text)
    text = text.replace('?', ' ?').replace('!', ' !')

    #first split lines, then split words:
    corpus = [a.split() for a in text.splitlines()]

    tagged, untagged = [], []

    for line in corpus:
        result = []
        for word in line:
            wtag = tag(word)  # tag once, reuse for both the output and the check
            result.append((word, wtag))
            if wtag == u'none':
                untagged.append(word)
        tagged.append(result)

    for line in tagged:
        # One output line per input line; an empty input line prints empty.
        print(u' '.join(u"(%s %s)" % (wtag, word) for word, wtag in line))

    sys.stderr.write("untagged words:\n")
    for a in set(untagged):
        sys.stderr.write("%s " % a)

    N = sum(len(line) for line in corpus)
    u = len(untagged)
    # Empty input has no tokens; report 0% rather than dividing by zero.
    coverage = (N - u) / float(N) * 100 if N else 0.0
    sys.stderr.write("\n\n word count %d, tagged %d = %f%%, untagged %d\n"
            % (N, N - u, coverage, u))

95

def tag(word1):
    """Return a tag for a single token, or u"none" if no rule applies.

    Rules are tried in this order: very short tokens and numbers, tokens
    without any letter (returned unchanged as their own tag), elided forms,
    surrounding punctuation (stripped, then re-tagged), the closed word
    classes, the -n / -j endings (stripped, then re-tagged), the word
    ending, and finally capitalisation.

    word1 -- the token as it appears in the text; case is only significant
             for the final capitalisation fallback.
    """
    word = word1.lower()

    #non-words or numbers
    # NOTE(review): the length test comes first, so a single digit such as
    # "5" is tagged SYM rather than CD -- confirm that this is intended.
    if len(word) <= 1:
        return "SYM"
    if word in NUMBER or word.isdigit():
        return "CD"
    if not any(a.isalpha() for a in word):
        return word

    #elision is only allowed on article la or nouns:
    if word == "l'":
        return "DT"
    if word[-1] in u"'’":
        return "NN"
    #other punctuation is not part of word:
    if not word[-1].isalpha():
        return tag(word1[:-1])
    if not word[0].isalpha():
        return tag(word1[1:])

    #closed word classes.  These are looked up BEFORE any ending is
    #stripped: otherwise listed words that happen to end in -n or -j are
    #mangled first ("nun" -> "nu" -> UH, "tuj" -> "tu" -> VB,
    #"ajn" -> "a" -> SYM, "plej" -> "ple" -> RB).
    if word == 'nu' or word == 'jen':
        return "UH"
    if word in CC:
        return "CC"
    if word in IN:
        return u"IN"

    if word in DT:
        return "DT"
    if word in PRP:
        return u"PRP"
    if word in PRPd:
        return u"PRP"

    if word in WRB:
        return "RB"
    if word in RB:
        return "RB"

    if word in JJR:
        return "JJ"

    if word == "kio":
        return "WDT"
    # "kiu" and "kies" are also listed in DT, so while they remain there
    # the DT test above wins and these two lines are not reached.
    if word == "kiu":
        return "WP"
    if word == "kies":
        return "WP"

    #words like hodiaŭ, preskaŭ etc.
    if word[-2:] == u'aŭ':
        return "RB"

    #tag uninflected forms: drop a final -n or -j and start over.  The
    #original-case token is passed on so that the capitalisation fallback
    #at the bottom still works for the shortened form.
    if word[-1] == 'n' or word[-1] == 'j':
        return tag(word1[:-1])

    #open word classes:
    if word[-1] == 'a':
        return "JJ"
    if word[-1] == 'e':
        return "RB"
    if word[-1] == 'i':
        return "VB"
    if word[-1] == 'o':
        return "NN"
    if word[-1] == 'u':
        return "VB"

    #verbs: -as, -is, -os, -us
    if word[-1:] == 's' and word[-2] in u"aiou":
        return "VB"

    #proper nouns / names
    if capitalized(word1):
        return "NN"

    return u"none"

156

def capitalized(word):
    """Return True if the first character of word is an upper-case letter.

    The Esperanto capitals ĈĜĤĴŜŬ are checked explicitly in addition to
    whatever str.isupper() recognises.  An empty word is not capitalized.
    """
    if not word:
        # Guard: indexing an empty string would raise IndexError.
        return False
    first = word[0]
    #esperanto alphabet including non-esperanto western characters
    return first.isupper() or first in u"ĈĜĤĴŜŬ"

162

def lower(word):
    """Return word in lower case.

    After the regular lower() call, the six accented Esperanto capitals are
    additionally mapped to their lower-case forms one by one.
    """
    lowered = word.lower()
    accented_pairs = (
        (u'Ĉ', u'ĉ'),
        (u'Ĝ', u'ĝ'),
        (u'Ĥ', u'ĥ'),
        (u'Ĵ', u'ĵ'),
        (u'Ŝ', u'ŝ'),
        (u'Ŭ', u'ŭ'),
    )
    for capital, small in accented_pairs:
        lowered = lowered.replace(capital, small)
    return lowered

166

def uninflect(word):
    """Return word without a trailing 'jn', 'n' or 'j'.

    Only one ending is removed; 'jn' is tried first so that it is taken off
    as a whole.  Words without such an ending, including the empty string,
    are returned unchanged.
    """
    for ending in ('jn', 'n', 'j'):
        # endswith() is safe on the empty string, unlike word[-1].
        if word.endswith(ending):
            return word[:-len(ending)]
    return word

#sort dictionary by value, highest first
def dictsort(d):
    """Return the (key, value) pairs of d as a list, highest value first.

    d -- a mapping whose values can be compared with each other.
    """
    # sorted() accepts any iterable, so this also works where items() does
    # not return a list with an in-place sort() method.
    #operator.itemgetter(1) returns the 2nd element of a sequence (in this case, value associated with key)
    pairs = sorted(d.items(), key=operator.itemgetter(1))
    # Reverse after an ascending sort (rather than sorting with
    # reverse=True) to keep the order among equal values as before.
    pairs.reverse()
    return pairs

if __name__ == '__main__':
    # Tag the vocabulary of the file named as first command-line argument;
    # without an argument the previous default file name is used.
    # Call main() instead to tag running text read from standard input.
    preprocessmorph(sys.argv[1] if len(sys.argv) > 1 else 'dualibro.txt')

Source Code for Module esperantotagger