# -*- coding: utf-8 -*-
""" An application of Data-Oriented Parsing to Esperanto.
Combines a syntax and a morphology corpus. """

from dopg import *
from nltk import UnsortedChartParser, NgramModel
from bitpar import BitParChartParser
from random import sample, seed
seed()
11
def chapelitoj(word):
    """ Convert x-system digraphs to Esperanto diacritic letters
    (cx -> ĉ, gx -> ĝ, hx -> ĥ, jx -> ĵ, sx -> ŝ, ux -> ŭ).
    Returns a unicode string.
    (name inferred as the inverse of malchapelitoj; the original
    definition line was lost) """
    return unicode(word).replace(u'cx', u'ĉ').replace(u'gx', u'ĝ').replace(u'hx',
        u'ĥ').replace(u'jx', u'ĵ').replace(u'sx', u'ŝ').replace(u'ux', u'ŭ')
15
def malchapelitoj(word):
    """ Convert Esperanto diacritic letters to x-system digraphs
    (ĉ -> cx, ĝ -> gx, ĥ -> hx, ĵ -> jx, ŝ -> sx, ŭ -> ux).
    Returns a unicode string. """
    return unicode(word).replace(u'ĉ', u'cx').replace(u'ĝ', u'gx').replace(u'ĥ',
        u'hx').replace(u'ĵ', u'jx').replace(u'ŝ', u'sx').replace(u'ŭ', u'ux')
19
21 """ make sure all terminals have POS tags;
22 invent one if necessary ("parent_word") """
23 result = tree.copy(True)
24 for a in tree.treepositions('leaves'):
25 if len(tree[a[:-1]]) != 1:
26 result[a] = Tree("%s_%s" % (tree[a[:-1]].node, tree[a]), [tree[a]])
27 return result
28
30 """ strip all function labels from a tree with labels of
31 the form "function:form", eg. S:np for subject, np. """
32 for a in tree.treepositions():
33 if isinstance(tree[a], Tree) and ':' in tree[a].node:
34 tree[a].node = tree[a].node.split(':')[1]
35 return tree
36
38 """ `Data-Oriented Segmentation 1': given a sequence of segmented words
39 (ie., a sequence of morphemes), produce a dictionary with extrapolated
40 segmentations (mapping words to sequences of morphemes).
41 Assumes non-ambiguity.
42 Method: cartesian product of all possible morphemes at position 0..n, where n is maximum word length."""
43 l = [len(a) for a in words]
44 morph_at = dict((x, set(a[x] for a,n in zip(words, l) if n > x))
45 for x in range(0, max(l)))
46 return dict(("".join(a), a) for a in
47 reduce(chain, (cartpi([morph_at[x] for x in range(n)])
48 for n in range(min(l), max(l)))))
50 """ `Data-Oriented Segmentation 2': given a sequence of segmented words
51 (ie., a sequence of morphemes), produce a dictionary with extrapolated
52 segmentations (mapping words to sequences of morphemes).
53 Discards ambiguous results.
54 Method: cartesian product of all words with the same number of morphemes. """
55 l = [len(a) for a in words]
56 return dict(("".join(a), a) for a in
57 reduce(chain, (cartpi(zip(*(w for w, m in zip(words, l) if m == n)))
58 for n in range(min(l), max(l)))))
60
61
62
63
64
65
def dos3(words):
    """ extrapolate segmentations with a bigram model over morphemes:
    for each attested morpheme count n, sample as many pseudo-words of
    n morphemes as were observed in the input.
    Yields sequences of morphemes.
    (name inferred; the original definition line was lost) """
    model = NgramModel(2, words)
    lengths = map(len, words)
    for n in range(2, max(lengths) + 1):
        # generate one pseudo-word per attested word of n morphemes
        # (the original iterated the bare count -- an int -- which
        # raises TypeError; range() is required here)
        for m in range(lengths.count(n)):
            yield model.generate(n)
73
77
79 """ wrap a segmentation dictionary in a naive unknown word
80 segmentation function with some heuristics
81 (phonological rules could probably improve this further) """
82
83
84
85 def s(segmentd):
86 def f(w):
87 """ consult segmentation dictionary with fallback to rule-based heuristics. """
88 try: return segmentd[w]
89
90 except KeyError:
91 if w[-1] in 'jn': return f(w[:-1]) + (w[-1],)
92 if w[-1] in 'oaeu': return (w[:-1], w[-1])
93 if w[-1] == 's': return (w[:-2], w[-2:])
94 if w[-1] == 'i': return (w[:-1], w[-1])
95
96 return (w,)
97 return f
98 return s(segmentd)
99
101 """ merge morphology into phrase structure tree """
102 copy = tree.copy(True)
103 for a,w in zip(tree.treepositions('leaves'), segmented):
104 try:
105
106
107 copy[a[:-1]] = md.mostprobableparse(w)[0]
108 except Exception as e:
109 print "word:", tree[a[:-1]][0], "segmented", w
110 print "error:", e
111 return copy
112
114 """ an interactive interface to the toy corpus """
115 d = GoodmanDOP((Tree(malchapelitoj(a)) for a in train), rootsymbol='S', parser=BitParChartParser, n=100, unknownwords='unknownwords', openclassdfsa='pos.dfsa', name='syntax')
116 print "built syntax model"
117
118 mcorpus = map(malchapelitoj, open("morph.corp.txt").readlines())
119 md = GoodmanDOP((cnf(Tree(a)) for a in mcorpus), rootsymbol='W', wrap=True, parser=BitParChartParser, n=100, unknownwords='unknownmorph', name='morphology')
120 print "built morphology model"
121
122 segmentd = dict(("".join(a), tuple(a)) for a in (Tree(a).leaves() for a in mcorpus))
123 print "morphology exemplars: ", " ".join(segmentd.keys())
124 print "segmentation dictionary size:", len(segmentd),
125
126 mlexicon = set(reduce(chain, segmentd.values()))
127 segmentd = dos1(set(segmentd.values()))
128
129 for a in (Tree(a).leaves() for a in mcorpus):
130 segmentd["".join(a)] = tuple(a)
131 segment = segmentor(segmentd)
132
133 print "extrapolated:", len(segmentd)
134 from cPickle import dump
135 dump(segmentd, open('segmentd.pickle', 'wb'), protocol=-1)
136
137 print "analyzing morphology of treebank"
138 mtreebank = []
139 for n, a in enumerate(Tree(a) for a in train):
140 print '%d / %d:' % (n, len(train)-1),
141 mtreebank.append(morphmerge(a, md, map(segment, a.leaves())))
142 print
143
144
145
146 msd = GoodmanDOP(mtreebank, rootsymbol='S', parser=BitParChartParser, n=100, unknownwords='unknownmorph', name='morphsyntax')
147 print "built combined morphology-syntax model"
148
149 return d, md, msd, segment, mlexicon
150
152
153 from corpus import corpus
154 test = sample(corpus, int(0.1 * len(corpus)))
155 train = [a for a in corpus if a not in test]
156 d, md, msd, segment, lexicon = morphology(train)
157
158
159 for tree in (Tree(a) for a in []):
160 w = tree.leaves()
161
162 try:
163 sent = list(reduce(chain, map(segment, w)))
164 print sent
165 print msd.removeids(msd.mostprobableparse(sent))
166 except Exception as e:
167 print "error", e
168
169
170 try:
171 print morphmerge(d.removeids(d.mostprobableparse(w)), md, map(segment, w))
172 except Exception as e:
173 print "error:", e
174
175 def guess(w):
176 """ heuristics for a plausible morphological structure """
177 if w[:3] == 'mal':
178 a = guess(w[3:])
179 return "(%s (%s (A mal) %s) %s)" % (a[1:a.index(' ')], a[a.index(' ')+1:a.index(' ', 2)], a[a.index(' '):-1], a[:-2])
180 if w[-1] == 'n':
181 a = guess(w[:-1])
182 return "(%s %s n)" % (a[1:a.index(' ')], a)
183 if w[-1] == 'j':
184 a = guess(w[:-1])
185 return "(%s %s j)" % (a[1:a.index(' ')], a)
186 if w[-1] == 'o':
187 return "(NN (N %s) o)" % w[:-1]
188 if w[-1] == 'a':
189 return "(JJ (J %s) a)" % w[:-1]
190 if w[-1] == 'e':
191 return "(RB (R %s) e)" % w[:-1]
192 if w[-1] == 'i':
193 return "(VB (V %s) i)" % w[:-1]
194 if w[-1] == 's':
195 return "(VB (V %s) %s)" % (w[:-2], w[-2:])
196 return "(%s %s)" % (w, w)
197
198 for a in open("fundamento.vocab"):
199 if len(a) <= 1: continue
200 try: s = segment(a[:-1].lower())
201 except:
202 s = None
203 print "( %s)" % a[:-1].lower()
204 if s and all(a in lexicon for a in s):
205 try: print md.removeids(md.mostprobableparse(s))[0]
206 except:
207 try: print guess(a[:-1])
208 except: print "( %s)" % a[:-1].lower()
209 else:
210 try: print guess(a[:-1])
211 except: print "( %s)" % a[:-1].lower()
212
214 from corpus import corpus
215 d, md, msd, segment, lexicon = morphology(corpus)
216
217
218 w = "foo!"
219
220
221 while w:
222 print "sentence:",
223 w = raw_input().split()
224 if not w: break
225
226 print "morphology:"
227 for a in w:
228 try:
229 print a, md.mostprobableparse(segment(a))[0]
230 except Exception as e:
231 print "error:", e
232
233 print "morphology + syntax combined:"
234 try:
235 sent = list(reduce(chain, map(segment, w)))
236 print sent
237 print msd.removeids(msd.mostprobableparse(sent))
238
239
240 except Exception as e:
241 print "error", e
242
243 print "syntax & morphology separate:"
244
245 if 1:
246
247
248
249 print morphmerge(d.removeids(d.parse(w)), md, map(segment, w))
250
251
252
253
254
255
257 """ produce the goodman reduction of the full monato corpus """
258
259 d = GoodmanDOP((stripfunc(cnf(Tree(a.lower()))) for a in open("arbobanko.train")), rootsymbol='top', parser=BitParChartParser, name='syntax', cleanup=False)
260 test = ("%s\n" % "\n".join(Tree(a.lower()).leaves()) for a in open("arbobanko.gold"))
261 open("arbobanko.test", "w").writelines(test)
262
263
264 if __name__ == '__main__':
265 import doctest
266
267
268 fail, attempted = doctest.testmod(verbose=False,
269 optionflags=doctest.NORMALIZE_WHITESPACE | doctest.ELLIPSIS)
270 if attempted and not fail:
271 print "%d doctests succeeded!" % attempted
272 interface()
273
274
275