1
2 """ Shell interface to bitpar, an efficient chart parser for (P)CFGs.
3 Expects bitpar to be compiled and available in the PATH.
4 Currently only yields the one best parse tree without its probability.
5 Todo:
6 - yield n best parses with probabilites (parameter)
7 - parse chart output"""
8 from collections import defaultdict
9 from subprocess import Popen, PIPE
10 from uuid import uuid1
11 from nltk import Tree, ProbabilisticTree, FreqDist, InsideChartParser
12
14 - def __init__(self, weightedrules=None, lexicon=None, rootsymbol=None, unknownwords=None, openclassdfsa=None, cleanup=True, n=10, name=''):
15 """ Interface to bitpar chart parser. Expects a list of weighted
16 productions with frequencies (not probabilities).
17
18 @param weightedrules: sequence of tuples with strings
19 (lhs and rhs separated by tabs, eg. "S NP VP") and
20 frequencies. The reason we use this format is that
21 it is close to bitpar's file format; converting a
22 weighted grammar with probabilities to frequencies
23 would be a detour, and bitpar wants frequencies so
24 it can do smoothing.
25 @param lexicon: set of strings belonging to the lexicon
26 (ie., the set of terminals)
27 @param rootsymbol: starting symbol for the grammar
28 @param unknownwords: a file with a list of open class POS tags
29 with frequencies
30 @param openclassdfsa: a deterministic finite state automaton,
31 refer to the bitpar manpage.
32 @param cleanup: boolean, when set to true the grammar files will be
33 removed when the BitParChartParser object is deleted.
34 @param name: filename of grammar files in case you want to export it,
35 if not given will default to a unique identifier
36 @param n: the n best parse trees will be requested
37 >>> wrules = ( ("S\\tNP\\tVP", 1), \
38 ("NP\\tmary", 1), \
39 ("VP\\twalks", 1) )
40 >>> p = BitParChartParser(wrules, set(("mary","walks")))
41 >>> tree = p.parse("mary walks".split())
42 >>> print tree
43 (S (NP mary) (VP walks)) (p=1.0)
44
45 >>> from dopg import GoodmanDOP
46 >>> d = GoodmanDOP([tree], parser=InsideChartParser)
47 >>> d.parser.parse("mary walks".split())
48 ProbabilisticTree('S', [ProbabilisticTree('NP@1', ['mary'])
49 (p=1.0), ProbabilisticTree('VP@2', ['walks']) (p=1.0)])
50 (p=0.444444444444)
51 >>> d.parser.nbest_parse("mary walks".split(), 10)
52 [ProbabilisticTree('S', [ProbabilisticTree('NP@1', ['mary']) (p=1.0),
53 ProbabilisticTree('VP@2', ['walks']) (p=1.0)]) (p=0.444444444444),
54 ProbabilisticTree('S', [ProbabilisticTree('NP', ['mary']) (p=1.0),
55 ProbabilisticTree('VP@2', ['walks']) (p=1.0)]) (p=0.222222222222),
56 ProbabilisticTree('S', [ProbabilisticTree('NP@1', ['mary']) (p=1.0),
57 ProbabilisticTree('VP', ['walks']) (p=1.0)]) (p=0.222222222222),
58 ProbabilisticTree('S', [ProbabilisticTree('NP', ['mary']) (p=1.0),
59 ProbabilisticTree('VP', ['walks']) (p=1.0)]) (p=0.111111111111)]
60
61 >>> d = GoodmanDOP([tree], parser=BitParChartParser)
62 writing grammar
63 >>> d.parser.parse("mary walks".split())
64 ProbabilisticTree('S', [Tree('NP@1', ['mary']), Tree('VP@2', ['walks'])]) (p=0.444444)
65 >>> list(d.parser.nbest_parse("mary walks".split()))
66 [ProbabilisticTree('S', [Tree('NP@1', ['mary']), Tree('VP@2', ['walks'])])
67 (p=0.444444),
68 ProbabilisticTree('S', [Tree('NP', ['mary']), Tree('VP@2', ['walks'])])
69 (p=0.222222),
70 ProbabilisticTree('S', [Tree('NP@1', ['mary']), Tree('VP', ['walks'])])
71 (p=0.222222),
72 ProbabilisticTree('S', [Tree('NP', ['mary']), Tree('VP', ['walks'])])
73 (p=0.111111)]
74
75 TODO: parse bitpar's chart output / parse forest
76 """
77
78 self.grammar = weightedrules
79 self.lexicon = lexicon
80 self.rootsymbol = rootsymbol
81 self.name = name
82 if name: self.id = name
83 else: self.id = uuid1()
84 self.cleanup = cleanup
85 self.n = n
86 self.unknownwords = unknownwords
87 self.openclassdfsa = openclassdfsa
88 if weightedrules and lexicon:
89 self.writegrammar('/tmp/g%s.pcfg' % self.id, '/tmp/g%s.lex' % self.id)
90 elif not name:
91 raise ValueError("need grammar or file name")
92 self.start()
93
95 cmd = "rm /tmp/g%s.pcfg /tmp/g%s.lex" % (self.id, self.id)
96 if self.cleanup: Popen(cmd.split())
97 self.stop()
98
100
101 options = "bitpar -q -b %d -vp -p " % (self.n)
102
103
104 if self.rootsymbol: options += "-s %s " % self.rootsymbol
105 if self.unknownwords: options += "-u %s " % self.unknownwords
106 if self.openclassdfsa: options += "-w %s " % self.openclassdfsa
107 if self.name:
108 options += "%s.pcfg %s.lex" % (self.id, self.id)
109 else:
110 options += "/tmp/g%s.pcfg /tmp/g%s.lex" % (self.id, self.id)
111
112 self.bitpar = Popen(options.split(), stdin=PIPE, stdout=PIPE, stderr=PIPE)
113
115 if not isinstance(self.bitpar.poll(), int):
116 self.bitpar.terminate()
117
119 if isinstance(self.bitpar.poll(), int): self.start()
120 result, stderr = self.bitpar.communicate("%s\n\n" % "\n".join(sent))
121 if not "=" in result:
122
123 raise ValueError("no output. stdout: \n%s\nstderr:\n%s " % (result.strip(), stderr.strip()))
124 prob, tree = result.split("\n", 1)[0], result.split("\n", 2)[1]
125 prob = float(prob.split("=")[1])
126 tree = Tree(tree)
127 return ProbabilisticTree(tree.node, tree, prob=prob)
128
130 """ n has to be specified in the constructor because it is specified
131 as a command line parameter to bitpar, allowing it here would require
132 potentially expensive restarts of bitpar. """
133 if isinstance(self.bitpar.poll(), int): self.start()
134 result, stderr = self.bitpar.communicate("%s\n\n" % "\n".join(sent))
135 results = result.split("\n")[:-1]
136 probs = (float(a.split("=")[1]) for a in results[::2] if "=" in a)
137 trees = (Tree(a) for a in results[1::2])
138 return (ProbabilisticTree(b.node, b, prob=a) for a, b in zip(probs, trees))
139
141 """ write a grammar to files f and l in a format that bitpar
142 understands. f will contain the grammar rules, l the lexicon
143 with pos tags. """
144 f, l = open(f, 'w'), open(l, 'w')
145 lex = defaultdict(list)
146 lex = defaultdict(FreqDist)
147 def process():
148 for rule, freq in self.grammar:
149 lhs, rhs = rule.split('\t',1)[0], rule.split('\t')[1:]
150
151 if rhs[0] in self.lexicon:
152
153 lex[rhs[0]].inc(lhs)
154
155 elif len(rhs) == 0 or '' in (str(a).strip() for a in rhs): continue
156 else:
157
158
159 yield "%s\t%s\n" % (repr(freq), rule)
160
161
162
163 f.writelines(process())
164 def proc(lex):
165 for word, tags in lex.items():
166
167
168 yield "%s\t%s\n" % (word, "\t".join(' '.join(map(str, a)) for a in tags.items()))
169 l.writelines(proc(lex))
170 f.close()
171 l.close()
172
173 if __name__ == '__main__':
174 import doctest
175
176
177 fail, attempted = doctest.testmod(verbose=False,
178 optionflags=doctest.NORMALIZE_WHITESPACE | doctest.ELLIPSIS)
179 if attempted and not fail:
180 print "%d doctests succeeded!" % attempted
181