Module morph
[hide private]
[frames] | [no frames]

Source Code for Module morph

  1  #!/usr/bin/python 
  2  # -*- coding: UTF-8 -*- 
  3  """ An application of Data-Oriented Parsing to Esperanto. 
  4          Combines a syntax and a morphology corpus. """ 
  5   
  6  from dopg import * 
  7  from nltk import UnsortedChartParser, NgramModel 
  8  from bitpar import BitParChartParser 
  9  from random import sample,seed 
 10  seed() 
 11   
def chapelitoj(word):
	""" Replace x-system digraphs with the proper Esperanto diacritic
	letters (cx -> ĉ, gx -> ĝ, ...), for lowercase as well as capital
	letters (Ĉ Ĝ Ĥ Ĵ Ŝ Ŭ).

	@param word: a string in x-system spelling
	@return: a unicode string in standard Esperanto orthography """
	result = unicode(word)
	# capitalized letters may be written "Cx" or (all-caps) "CX"
	for plain, hatted in (
			(u'cx', u'ĉ'), (u'gx', u'ĝ'), (u'hx', u'ĥ'),
			(u'jx', u'ĵ'), (u'sx', u'ŝ'), (u'ux', u'ŭ'),
			(u'Cx', u'Ĉ'), (u'Gx', u'Ĝ'), (u'Hx', u'Ĥ'),
			(u'Jx', u'Ĵ'), (u'Sx', u'Ŝ'), (u'Ux', u'Ŭ'),
			(u'CX', u'Ĉ'), (u'GX', u'Ĝ'), (u'HX', u'Ĥ'),
			(u'JX', u'Ĵ'), (u'SX', u'Ŝ'), (u'UX', u'Ŭ')):
		result = result.replace(plain, hatted)
	return result
def malchapelitoj(word):
	""" Replace Esperanto diacritic letters with x-system digraphs
	(ĉ -> cx, ĝ -> gx, ...), for lowercase as well as capital letters
	(Ĉ Ĝ Ĥ Ĵ Ŝ Ŭ).

	@param word: a string in standard Esperanto orthography
	@return: a unicode string in x-system (ascii-safe) spelling """
	result = unicode(word)
	# capitals map to "Cx" etc. (title-case digraph)
	for hatted, plain in (
			(u'ĉ', u'cx'), (u'ĝ', u'gx'), (u'ĥ', u'hx'),
			(u'ĵ', u'jx'), (u'ŝ', u'sx'), (u'ŭ', u'ux'),
			(u'Ĉ', u'Cx'), (u'Ĝ', u'Gx'), (u'Ĥ', u'Hx'),
			(u'Ĵ', u'Jx'), (u'Ŝ', u'Sx'), (u'Ŭ', u'Ux')):
		result = result.replace(hatted, plain)
	return result
def cnf(tree):
	""" Ensure that every terminal has a POS tag of its own; where a leaf
	shares its parent with siblings, insert an invented preterminal node
	labelled "parent_word" above it. Returns a modified deep copy. """
	fixed = tree.copy(True)
	for leafpos in tree.treepositions('leaves'):
		parentpos = leafpos[:-1]
		parent, leaf = tree[parentpos], tree[leafpos]
		if len(parent) == 1:
			continue
		fixed[leafpos] = Tree("%s_%s" % (parent.node, leaf), [leaf])
	return fixed
28
def stripfunc(tree):
	""" Strip all function labels from a tree whose labels have the form
	"function:form" (eg. S:np for a subject that is an np); only the form
	part is kept. Modifies the tree in place and returns it. """
	for pos in tree.treepositions():
		subtree = tree[pos]
		if not isinstance(subtree, Tree):
			continue
		if ':' in subtree.node:
			subtree.node = subtree.node.split(':')[1]
	return tree
36
def dos(words):
	""" `Data-Oriented Segmentation 1': given a sequence of segmented words
	(ie., a sequence of morphemes), produce a dictionary with extrapolated
	segmentations (mapping words to sequences of morphemes).
	Assumes non-ambiguity.
	Method: cartesian product of all possible morphemes at position 0..n,
	where n is maximum word length."""
	# number of morphemes in each word
	l = [len(a) for a in words]
	# morph_at[x] = set of morphemes attested at position x in any word
	# that is long enough to have a position x
	morph_at = dict((x, set(a[x] for a,n in zip(words, l) if n > x))
		for x in range(0, max(l)))
	# for each morpheme count n, take the cartesian product of the
	# position-wise morpheme sets; key each result by its concatenation.
	# NOTE(review): range(min(l), max(l)) excludes the maximum morpheme
	# count, so the longest attested segmentations are not extrapolated
	# -- confirm whether that is intended (callers re-insert the original
	# words afterwards).
	return dict(("".join(a), a) for a in
		reduce(chain, (cartpi([morph_at[x] for x in range(n)])
		for n in range(min(l), max(l)))))
def dos1(words):
	""" `Data-Oriented Segmentation 2': given a sequence of segmented words
	(ie., a sequence of morphemes), produce a dictionary with extrapolated
	segmentations (mapping words to sequences of morphemes).
	Discards ambiguous results.
	Method: cartesian product of all words with the same number of morphemes. """
	# number of morphemes in each word
	l = [len(a) for a in words]
	# zip(*...) transposes the words with n morphemes into n position-wise
	# tuples of morphemes; cartpi then recombines them freely. Keys that
	# are generated more than once (ambiguous concatenations) collapse to
	# the segmentation generated last, since dict() keeps the last pair.
	# NOTE(review): as in dos(), range(min(l), max(l)) skips the maximum
	# morpheme count -- confirm intended.
	return dict(("".join(a), a) for a in
		reduce(chain, (cartpi(zip(*(w for w, m in zip(words, l) if m == n)))
		for n in range(min(l), max(l)))))
def dos2(words):
	""" `Data-Oriented Segmentation 3': generate new segmentations by
	sampling from a bigram model over morphemes.

	@param words: sequence of segmented words (sequences of morphemes)
	@return: generator yielding sampled morpheme sequences """
	# bigram model. there must be a way to avoid precomputing this?
	# this is not working yet as I haven't succeeded in getting nltk's
	# smoothing implementations to work.
	# also, a better way would be to systematically try all possibilities
	# and store the ones with a probability above a threshold
	# (there must be an algorithm for doing this efficiently)
	model = NgramModel(2, words)
	lengths = [len(a) for a in words]
	# iterate over possible number of morphemes
	for n in range(2, max(lengths)+1):
		# sample as many words as there are words with this number of
		# morphemes. (bug fix: lengths.count(n) is an int and cannot be
		# iterated directly -- the original `for m in lengths.count(n)'
		# raised TypeError as soon as the generator was consumed.)
		for m in range(lengths.count(n)):
			yield model.generate(n)
73
def dos3(words):
	""" `Data-Oriented Segmentation 4': regex-based tokenizer.
	Not implemented; placeholder that returns None. """
	# regex tokenizer. TBD
	pass
77
def segmentor(segmentd):
	""" Wrap a segmentation dictionary in a naive unknown word
	segmentation function with some heuristics
	(phonological rules could probably improve this further).

	@param segmentd: dict mapping words to tuples of morphemes
	@return: function mapping a word to a tuple of morphemes """
	# No wrapper function is needed to capture `segmentd': Python
	# closures bind enclosing-function locals, parameters included,
	# so `f' can refer to it (and to itself, for recursion) directly.
	def f(w):
		""" consult segmentation dictionary with fallback to rule-based heuristics. """
		try:
			return segmentd[w]
		except KeyError:
			# naive esperanto segmentation (assume root of the appropriate type)
			if w[-1] in 'jn': return f(w[:-1]) + (w[-1],)	# plural -j / accusative -n, strip and recurse
			if w[-1] in 'oaeu': return (w[:-1], w[-1])	# noun/adjective/adverb/u-mood ending
			if w[-1] == 's': return (w[:-2], w[-2:])	# verb tense ending (-as/-is/-os/-us)
			if w[-1] == 'i': return (w[:-1], w[-1])		# infinitive
			# last resort, unanalyzable word, e.g. proper noun
			return (w,)
	return f
def morphmerge(tree, md, segmented):
	""" Merge morphology into phrase structure tree: replace the node above
	each leaf with the most probable morphological parse of that word.

	@param tree: a phrase structure Tree
	@param md: morphology model; only its mostprobableparse() is used here
		(presumably a GoodmanDOP instance -- confirm against callers)
	@param segmented: one morpheme sequence per leaf of `tree'
	@return: a deep copy of `tree' with morphological subtrees spliced in;
		when a word fails to parse, it is reported and its original
		subtree is left unchanged (best-effort, no exception escapes). """
	copy = tree.copy(True)
	for a,w in zip(tree.treepositions('leaves'), segmented):
		try:
			# MPD: copy[a[:-1]] = md.removeids(md.parse(w))[0]
			# MPP
			copy[a[:-1]] = md.mostprobableparse(w)[0]
		except Exception as e:
			print "word:", tree[a[:-1]][0], "segmented", w
			print "error:", e
	return copy
112
def morphology(train):
	""" Build the models for a treebank: a syntax DOP model, a morphology
	DOP model, a segmentation function and a combined morphology-syntax
	model (the original docstring, "an interactive interface to the toy
	corpus", described interface() instead).

	@param train: sequence of bracketed tree strings (syntax treebank)
	@return: tuple (d, md, msd, segment, mlexicon): syntax model,
		morphology model, combined model, segmentation function,
		set of known morphemes """
	# convert to x-system spelling -- presumably so the external parser
	# only has to deal with ascii input; confirm against bitpar setup
	d = GoodmanDOP((Tree(malchapelitoj(a)) for a in train), rootsymbol='S', parser=BitParChartParser, n=100, unknownwords='unknownwords', openclassdfsa='pos.dfsa', name='syntax')
	print "built syntax model"

	# one bracketed morphology tree per line in the corpus file
	mcorpus = map(malchapelitoj, open("morph.corp.txt").readlines())
	md = GoodmanDOP((cnf(Tree(a)) for a in mcorpus), rootsymbol='W', wrap=True, parser=BitParChartParser, n=100, unknownwords='unknownmorph', name='morphology')
	print "built morphology model"

	# map each attested word (concatenation of its morphemes) to its segmentation
	segmentd = dict(("".join(a), tuple(a)) for a in (Tree(a).leaves() for a in mcorpus))
	print "morphology exemplars: ", " ".join(segmentd.keys())
	print "segmentation dictionary size:", len(segmentd),

	# all morphemes seen in the morphology corpus
	mlexicon = set(reduce(chain, segmentd.values()))
	# extrapolate segmentations for unseen words from the attested ones
	segmentd = dos1(set(segmentd.values()))
	# restore the original words in case dos1 overwrote any of them
	for a in (Tree(a).leaves() for a in mcorpus):
		segmentd["".join(a)] = tuple(a)
	segment = segmentor(segmentd)

	print "extrapolated:", len(segmentd) #, " ".join(segmentd.keys())
	from cPickle import dump
	dump(segmentd, open('segmentd.pickle', 'wb'), protocol=-1)

	print "analyzing morphology of treebank"
	mtreebank = []
	for n, a in enumerate(Tree(a) for a in train):
		print '%d / %d:' % (n, len(train)-1),
		mtreebank.append(morphmerge(a, md, map(segment, a.leaves())))
	print

	#mtreebank = [m(Tree(a)) for a in train]
	#for a in mtreebank: print a
	msd = GoodmanDOP(mtreebank, rootsymbol='S', parser=BitParChartParser, n=100, unknownwords='unknownmorph', name='morphsyntax')
	print "built combined morphology-syntax model"

	return d, md, msd, segment, mlexicon
150
def toy():
	""" Demo on the toy corpus: train on 90% of it, (optionally) evaluate
	on the held-out 10%, then print a morphological analysis for every
	entry of the fundamento vocabulary. """
	#syntax treebank
	from corpus import corpus
	test = sample(corpus, int(0.1 * len(corpus)))
	train = [a for a in corpus if a not in test]
	d, md, msd, segment, lexicon = morphology(train)

	#evaluation
	# NOTE(review): the evaluation loop iterates over the empty list, so it
	# is effectively disabled; replace [] with `test' to enable it.
	for tree in (Tree(a) for a in []): #test
		w = tree.leaves()
		#morphology + syntax combined
		try:
			sent = list(reduce(chain, map(segment, w)))
			print sent
			print msd.removeids(msd.mostprobableparse(sent))
		except Exception as e:
			print "error", e

		#syntax & morphology separate
		try:
			print morphmerge(d.removeids(d.mostprobableparse(w)), md, map(segment, w))
		except Exception as e:
			print "error:", e

	def guess(w):
		""" heuristics for a plausible morphological structure,
		returned as a bracketed-tree string """
		# mal- prefix: recurse on the rest and rebuild the bracketing by
		# slicing the returned string at its space positions
		if w[:3] == 'mal':
			a = guess(w[3:])
			return "(%s (%s (A mal) %s) %s)" % (a[1:a.index(' ')], a[a.index(' ')+1:a.index(' ', 2)], a[a.index(' '):-1], a[:-2])
		# accusative and plural suffixes: recurse on the stem
		if w[-1] == 'n':
			a = guess(w[:-1])
			return "(%s %s n)" % (a[1:a.index(' ')], a)
		if w[-1] == 'j':
			a = guess(w[:-1])
			return "(%s %s j)" % (a[1:a.index(' ')], a)
		# part-of-speech endings: noun, adjective, adverb, infinitive, tensed verb
		if w[-1] == 'o':
			return "(NN (N %s) o)" % w[:-1]
		if w[-1] == 'a':
			return "(JJ (J %s) a)" % w[:-1]
		if w[-1] == 'e':
			return "(RB (R %s) e)" % w[:-1]
		if w[-1] == 'i':
			return "(VB (V %s) i)" % w[:-1]
		if w[-1] == 's':
			return "(VB (V %s) %s)" % (w[:-2], w[-2:])
		# unanalyzable: word doubles as its own category
		return "(%s %s)" % (w, w)

	# analyze every vocabulary entry; fall back from the morphology model
	# to guess(), and from guess() to an unanalyzed one-node tree
	for a in open("fundamento.vocab"):
		if len(a) <= 1: continue
		try: s = segment(a[:-1].lower())
		except:
			s = None
			print "( %s)" % a[:-1].lower()
		if s and all(a in lexicon for a in s):
			try: print md.removeids(md.mostprobableparse(s))[0]
			except:
				try: print guess(a[:-1])
				except: print "( %s)" % a[:-1].lower()
		else:
			try: print guess(a[:-1])
			except: print "( %s)" % a[:-1].lower()
def interface():
	""" An interactive interface (REPL) to the toy corpus: read a sentence,
	print its morphological and syntactic analyses; empty input quits. """
	from corpus import corpus
	d, md, msd, segment, lexicon = morphology(corpus)

	#print d.grammar
	# dummy non-empty value so the while condition holds on first entry
	w = "foo!"

	# basic REPL
	while w:
		print "sentence:",
		w = raw_input().split()
		if not w: break #quit

		# per-word morphological analysis
		print "morphology:"
		for a in w:
			try:
				print a, md.mostprobableparse(segment(a))[0]
			except Exception as e:
				print "error:", e

		# parse the flattened morpheme sequence with the combined model
		print "morphology + syntax combined:"
		try:
			sent = list(reduce(chain, map(segment, w)))
			print sent
			print msd.removeids(msd.mostprobableparse(sent))
			#for tree in d.parser.nbest_parse(w):
			#	print tree
		except Exception as e:
			print "error", e

		# parse words with the syntax model, then splice in morphology
		print "syntax & morphology separate:"
		#try:
		if 1:
			#TODO?: d.parse(w) should backoff to POS supplied by morphology for
			#unknown words; but bitpar already does this with its word class
			#automata support
			print morphmerge(d.removeids(d.parse(w)), md, map(segment, w))
		#sent = ["".join(a.split('|')) for a in w]
		#for tree in d.parser.nbest_parse(w):
		#	print tree
		#except Exception as e:
		#	print "error:", e
def monato():
	""" Produce the goodman reduction of the full monato corpus.

	Reads the treebank from "arbobanko.train" (building the grammar as a
	side effect of instantiating GoodmanDOP) and writes the gold-standard
	sentences, one word per line, to "arbobanko.test". """
	# turn cleanup off so that the grammar will not be removed
	d = GoodmanDOP((stripfunc(cnf(Tree(a.lower()))) for a in open("arbobanko.train")), rootsymbol='top', parser=BitParChartParser, name='syntax', cleanup=False)
	test = ("%s\n" % "\n".join(Tree(a.lower()).leaves()) for a in open("arbobanko.gold"))
	# close the output file explicitly instead of relying on the garbage
	# collector to flush and release the handle
	out = open("arbobanko.test", "w")
	try:
		out.writelines(test)
	finally:
		out.close()
262 #d = GoodmanDOP((Tree(a) for a in corpus), rootsymbol='S', wrap=True) 263 264 if __name__ == '__main__': 265 import doctest 266 # do doctests, but don't be pedantic about whitespace (I suspect it is the 267 # militant anti-tab faction who are behind this obnoxious default) 268 fail, attempted = doctest.testmod(verbose=False, 269 optionflags=doctest.NORMALIZE_WHITESPACE | doctest.ELLIPSIS) 270 if attempted and not fail: 271 print "%d doctests succeeded!" % attempted 272 interface() #interactive demo with toy corpus 273 #toy() #get toy corpus DOP reduction 274 #monato() #get monato DOP reduction 275