1
2 """DOP1 implementation. Andreas van Cranenburgh <andreas@unstable.nl>
3 TODOs: unicode support (replace str and repr calls)"""
4
5
6
7 from collections import defaultdict
8 from itertools import chain, count
9
10 from nltk import Production, WeightedProduction, WeightedGrammar, FreqDist
11 from nltk import Tree, ImmutableTree, Nonterminal, InsideChartParser, UnsortedChartParser
12 from bitpar import BitParChartParser
13
15 """ cartesian product of two sequences """
16 for x in a:
17 for y in b:
18 yield x, y
19
21 """ produce a flattened fold using cartesian product as operator
22
23 >>> list(cartpi( ( (1,2), (3,4), (5,6) ) ) )
24 [(1, 3, 5), (2, 3, 5), (1, 4, 5), (2, 4, 5), (1, 3, 6), (2, 3, 6),
25 (1, 4, 6), (2, 4, 6)] """
26 if len(seq) == 0: return ((), )
27 else: return ((a,) + b for b in cartpi(seq[1:]) for a in seq[0])
28
29
31 """
32 Generate the productions that correspond to the non-terminal nodes of the tree.
33 For each subtree of the form (P: C1 C2 ... Cn) this produces a production of the
34 form P -> C1 C2 ... Cn.
35 @rtype: list of C{Production}s
36 """
37
38 if not (isinstance(tree.node, str) or isinstance(tree.node, unicode)):
39 raise TypeError, 'Productions can only be generated from trees having node labels that are strings'
40
41 prods = [Production(Nonterminal(tree.node), tree._child_names())]
42 for child in tree:
43 if isinstance(child, Tree):
44 prods += productions(child)
45 return prods
46
48 - def __init__(self, treebank, rootsymbol='S', wrap=False, cnf=True, cleanup=True, parser=InsideChartParser, **parseroptions):
49 """ initialize a DOP model given a treebank. uses the Goodman
50 reduction of a STSG to a PCFG. after initialization,
51 self.parser will contain an InsideChartParser.
52
53 >>> tree = Tree("(S (NP mary) (VP walks))")
54 >>> d = GoodmanDOP([tree])
55 >>> print d.grammar
56 Grammar with 12 productions (start state = S)
57 NP -> 'mary' [1.0]
58 NP@1 -> 'mary' [1.0]
59 S -> NP VP [0.25]
60 S -> NP VP@2 [0.25]
61 S -> NP@1 VP [0.25]
62 S -> NP@1 VP@2 [0.25]
63 S@0 -> NP VP [0.25]
64 S@0 -> NP VP@2 [0.25]
65 S@0 -> NP@1 VP [0.25]
66 S@0 -> NP@1 VP@2 [0.25]
67 VP -> 'walks' [1.0]
68 VP@2 -> 'walks' [1.0]
69 >>> print d.parser.parse("mary walks".split())
70 (S (NP mary) (VP@2 walks)) (p=0.25)
71
72 @param treebank: a list of Tree objects. Caveat lector:
73 terminals may not have (non-terminals as) siblings.
74 @param wrap: boolean specifying whether to add the start symbol
75 to each tree
76 @param parser: a class which will be instantiated with the DOP
77 model as its grammar. Support BitParChartParser.
78
79 instance variables:
80 - self.grammar a WeightedGrammar containing the PCFG reduction
81 - self.fcfg a list of strings containing the PCFG reduction
82 with frequencies instead of probabilities
83 - self.parser an InsideChartParser object
84 - self.exemplars dictionary of known parse trees (memoization)"""
85 nonterminalfd, ids = FreqDist(), count()
86 cfg = FreqDist()
87 self.exemplars = {}
88 if wrap:
89
90 treebank = [Tree(rootsymbol, [a]) for a in treebank]
91 if cnf:
92 for a in treebank:
93 a.chomsky_normal_form()
94
95 utreebank = list((tree, self.decorate_with_ids(tree, ids)) for tree in treebank)
96 lexicon = set(reduce(chain, (a.leaves() for a,b in utreebank)))
97
98 for tree,utree in utreebank:
99
100 self.nodefreq(tree, nonterminalfd)
101 self.nodefreq(utree, nonterminalfd)
102
103
104
105
106
107 if type(parser) == type(BitParChartParser):
108
109 cfg = FreqDist(reduce(chain, (self.goodman(tree, utree) for tree, utree in utreebank)))
110
111 self.fcfg = self.frequencies(cfg, nonterminalfd)
112 print "writing grammar"
113 self.parser = BitParChartParser(self.fcfg, lexicon, rootsymbol, cleanup=False, **parseroptions)
114 else:
115 cfg = FreqDist(reduce(chain, (self.goodman(tree, utree, False) for tree, utree in utreebank)))
116 probs = self.probabilities(cfg, nonterminalfd)
117
118 self.grammar = WeightedGrammar(Nonterminal(rootsymbol), probs)
119
120 self.parser = InsideChartParser(self.grammar)
121
122
123
124
125
126
127
128
129
130
131
132
133 del cfg, nonterminalfd
134
135 - def goodmanfd(self, tree, ids, nonterminalfd):
140
142 """ add unique identifiers to each non-terminal of a tree.
143
144 >>> tree = Tree("(S (NP mary) (VP walks))")
145 >>> d = GoodmanDOP([tree])
146 >>> d.decorate_with_ids(tree, count())
147 Tree('S@0', [Tree('NP@1', ['mary']), Tree('VP@2', ['walks'])])
148
149 @param ids: an iterator yielding a stream of IDs"""
150 utree = tree.copy(True)
151 for a in utree.subtrees():
152 a.node = "%s@%d" % (a.node, ids.next())
153 return utree
154
155 - def nodefreq(self, tree, nonterminalfd, leaves=1):
156 """count frequencies of nodes by calculating the number of
157 subtrees headed by each node. updates "nonterminalfd" as
158 a side effect
159
160 >>> fd = FreqDist()
161 >>> tree = Tree("(S (NP mary) (VP walks))")
162 >>> d = GoodmanDOP([tree])
163 >>> d.nodefreq(tree, fd)
164 4
165 >>> fd.items()
166 [('S', 4), ('NP', 1), ('VP', 1)]
167
168 #[('S', 9), ('NP', 2), ('VP', 2), ('mary', 1), ('walks', 1)]
169
170 @param nonterminalfd: the FreqDist to store the counts in."""
171 if isinstance(tree, Tree) and len(tree) > 0 and tree.height() > 2:
172 n = reduce((lambda x,y: x*y),
173 (self.nodefreq(x, nonterminalfd) + 1 for x in tree))
174 nonterminalfd.inc(tree.node, count=n)
175
176 return n
177 elif tree.height() == 2:
178
179 nonterminalfd.inc(tree.node, count=len(tree))
180
181 return 1
182
183 - def goodman(self, tree, utree, bitparfmt=True):
184 """ given a parsetree from a treebank, yield a goodman
185 reduction of eight rules per node (in the case of a binary tree).
186
187 >>> tree = Tree("(S (NP mary) (VP walks))")
188 >>> d = GoodmanDOP([tree])
189 >>> utree = d.decorate_with_ids(tree, count())
190 >>> sorted(d.goodman(tree, utree, False))
191 [(NP, ('mary',)), (NP@1, ('mary',)), (S, (NP, VP)), (S, (NP, VP@2)),
192 (S, (NP@1, VP)), (S, (NP@1, VP@2)), (S@0, (NP, VP)),
193 (S@0, (NP, VP@2)), (S@0, (NP@1, VP)), (S@0, (NP@1, VP@2)),
194 (VP, ('walks',)), (VP@2, ('walks',))]
195 """
196
197 sep = "\t"
198 for p, up in zip(tree.productions(), utree.productions()):
199
200 if len(p.rhs()) == 0: raise ValueError
201 if len(p.rhs()) == 1:
202 if not isinstance(p.rhs()[0], Nonterminal): rhs = (p.rhs(), )
203 else: rhs = (p.rhs(), up.rhs())
204
205 else:
206 if all(isinstance(a, Nonterminal) for a in up.rhs()):
207 rhs = set(cartpi(zip(p.rhs(), up.rhs())))
208 else: rhs = cartpi(zip(p.rhs(), up.rhs()))
209
210
211
212 for l, r in cartprod((p.lhs(), up.lhs()), rhs):
213
214 if bitparfmt:
215 yield sep.join((str(l), sep.join(map(str, r))))
216 else:
217 yield l, r
218
219
220
221
222
223
225 """merge cfg and frequency distribution into a pcfg with the right
226 probabilities.
227
228 @param cfg: a list of Productions
229 @param nonterminalfd: a FreqDist of (non)terminals (with and
230 without IDs)"""
231
232 def prob(l, r):
233
234
235 return reduce((lambda x,y: x*y), map((lambda z: '@' in str(z)
236 and fd[str(z)] or 1), r)) / float(fd[str(l)])
237
238
239
240
241
242
243
244
245
246 return [WeightedProduction(l, r, prob=freq*prob(l, r)) for (l,r),freq in cfg.items()]
247
248
249
251 """merge cfg and frequency distribution into a list of weighted
252 productions with frequencies as weights (as expected by bitpar).
253
254 @param cfg: a list of Productions
255 @param nonterminalfd: a FreqDist of (non)terminals (with and
256 without IDs)"""
257 def prob(r):
258 return reduce((lambda x,y: x*y), map((lambda z: '@' in str(z)
259 and fd[str(z)] or 1), r), 1)
260
261
262
263
264
265
266 return ((rule, freq * reduce((lambda x,y: x*y), map((lambda z: '@' in str(z) and fd[str(z)] or 1), rule.split('\t')[1:]), 1)) for rule, freq in cfg.items())
267
268
269
271 """ remove unique IDs introduced by the Goodman reduction """
272 for a in tree.treepositions():
273 if '@' in str(tree[a]):
274 tree[a].node = tree[a].node.split('@')[0]
275 return tree
276
278 """ memoize parse trees. TODO: maybe add option to add every
279 parse tree to the set of exemplars, ie., incremental learning.
280 this uses the most probable derivation (not very good)."""
281 try:
282 return self.exemplars[tuple(sent)]
283 except KeyError:
284 self.exemplars[tuple(sent)] = self.parser.parse(sent)
285 return self.exemplars[tuple(sent)]
286
288 """ warning: this problem is NP-complete. using an unsorted
289 chart parser avoids unnecessary sorting (since we need all
290 derivations anyway).
291
292 @param sent: a sequence of terminals
293 @param sample: None or int; if int then sample that many parses"""
294 p = FreqDist()
295 for a in self.parser.nbest_parse(sent, sample):
296 p.inc(ImmutableTree.convert(self.removeids(a)), a.prob())
297 if p.max(): return p.max()
298 else: raise ValueError("no parse")
299
301 """ not working yet. almost verbatim translation of Goodman's (1996)
302 most constituents correct parsing algorithm, except for python's
303 zero-based indexing. needs to be modified to return the actual parse
304 tree. expects a pcfg in the form of a dictionary from productions to
305 probabilities """
306 def g(s, t, x):
307 def f(s, t, x):
308 return self.pcfg[Production(rootsymbol,
309 sent[1:s] + [x] + sent[s+1:])]
310 def e(s, t, x):
311 return self.pcfg[Production(x, sent[s:t+1])]
312 return f(s, t, x) * e(s, t, x ) / e(1, n, rootsymbol)
313
314 sumx = defaultdict(int)
315 maxc = defaultdict(int)
316 for length in range(2, len(sent)+1):
317 for s in range(1, len(sent) + length):
318 t = s + length - 1
319 for x in self.nonterminals:
320 sumx[x] = g(s, t, x)
321 for k in range(self.addresses):
322
323 x = self.nonterminal[k]
324 sumx[x] += g(s, t, "%s@%d" % (x, k))
325 max_x = max(sumx[x] for x in self.nonterminals)
326
327
328 best_split = max(maxc[(s,r)] + maxc[(r+1,t)]
329 for r in range(s, t))
330
331
332 maxc[(s,t)] = sumx(max_x) + best_split
333
334 return maxc[(1, len(sent) + 1)]
335
337 """ a basic REPL for testing """
338 corpus = """(S (NP John) (VP (V likes) (NP Mary)))
339 (S (NP Peter) (VP (V hates) (NP Susan)))
340 (S (NP Harry) (VP (V eats) (NP pizza)))
341 (S (NP Hermione) (VP (V eats)))""".splitlines()
342 corpus ="""(S (NP (DT The) (NN cat)) (VP (VBP saw) (NP (DT the) (JJ hungry) (NN dog))))
343 (S (NP (DT The) (JJ little) (NN mouse)) (VP (VBP ate) (NP (DT the) (NN cat))))""".splitlines()
344
345
346 corpus = [Tree(a) for a in corpus]
347 for a in corpus:
348
349 a.chomsky_normal_form()
350
351 d = GoodmanDOP(corpus, rootsymbol='TOP', wrap='TOP', parser=BitParChartParser)
352
353
354 from nltk import ImmutableTree
355 print "corpus"
356 for a in corpus: print a
357 w = "foo!"
358 while w:
359 print "sentence:",
360 w = raw_input().split()
361
362 try:
363 p = FreqDist()
364 for a in d.parser.nbest_parse(w):
365 print a
366 p.inc(ImmutableTree.convert(d.removeids(a)), a.prob())
367
368
369 print
370 print 'best', p.max()
371
372 except Exception as e:
373 print "error", e
374
375 if __name__ == '__main__':
376 import doctest
377
378
379 fail, attempted = doctest.testmod(verbose=False,
380 optionflags=doctest.NORMALIZE_WHITESPACE | doctest.ELLIPSIS)
381 if attempted and not fail:
382 print "%d doctests succeeded!" % attempted
383 main()
384