Module bitpar
[hide private]
[frames] | no frames]

Source Code for Module bitpar

  1  #!/usr/bin/python 
  2  """ Shell interface to bitpar, an efficient chart parser for (P)CFGs. 
  3  Expects bitpar to be compiled and available in the PATH. 
  4  Currently only yields the one best parse tree without its probability. 
  5  Todo:  
  6   - yield n best parses with probabilites (parameter) 
  7   - parse chart output""" 
  8  from collections import defaultdict 
  9  from subprocess import Popen, PIPE 
 10  from uuid import uuid1 
 11  from nltk import Tree, ProbabilisticTree, FreqDist, InsideChartParser 
 12   
13 -class BitParChartParser:
14 - def __init__(self, weightedrules=None, lexicon=None, rootsymbol=None, unknownwords=None, openclassdfsa=None, cleanup=True, n=10, name=''):
15 """ Interface to bitpar chart parser. Expects a list of weighted 16 productions with frequencies (not probabilities). 17 18 @param weightedrules: sequence of tuples with strings 19 (lhs and rhs separated by tabs, eg. "S NP VP") and 20 frequencies. The reason we use this format is that 21 it is close to bitpar's file format; converting a 22 weighted grammar with probabilities to frequencies 23 would be a detour, and bitpar wants frequencies so 24 it can do smoothing. 25 @param lexicon: set of strings belonging to the lexicon 26 (ie., the set of terminals) 27 @param rootsymbol: starting symbol for the grammar 28 @param unknownwords: a file with a list of open class POS tags 29 with frequencies 30 @param openclassdfsa: a deterministic finite state automaton, 31 refer to the bitpar manpage. 32 @param cleanup: boolean, when set to true the grammar files will be 33 removed when the BitParChartParser object is deleted. 34 @param name: filename of grammar files in case you want to export it, 35 if not given will default to a unique identifier 36 @param n: the n best parse trees will be requested 37 >>> wrules = ( ("S\\tNP\\tVP", 1), \ 38 ("NP\\tmary", 1), \ 39 ("VP\\twalks", 1) ) 40 >>> p = BitParChartParser(wrules, set(("mary","walks"))) 41 >>> tree = p.parse("mary walks".split()) 42 >>> print tree 43 (S (NP mary) (VP walks)) (p=1.0) 44 45 >>> from dopg import GoodmanDOP 46 >>> d = GoodmanDOP([tree], parser=InsideChartParser) 47 >>> d.parser.parse("mary walks".split()) 48 ProbabilisticTree('S', [ProbabilisticTree('NP@1', ['mary']) 49 (p=1.0), ProbabilisticTree('VP@2', ['walks']) (p=1.0)]) 50 (p=0.444444444444) 51 >>> d.parser.nbest_parse("mary walks".split(), 10) 52 [ProbabilisticTree('S', [ProbabilisticTree('NP@1', ['mary']) (p=1.0), 53 ProbabilisticTree('VP@2', ['walks']) (p=1.0)]) (p=0.444444444444), 54 ProbabilisticTree('S', [ProbabilisticTree('NP', ['mary']) (p=1.0), 55 ProbabilisticTree('VP@2', ['walks']) (p=1.0)]) (p=0.222222222222), 56 ProbabilisticTree('S', [ProbabilisticTree('NP@1', ['mary']) (p=1.0), 57 ProbabilisticTree('VP', ['walks']) (p=1.0)]) (p=0.222222222222), 58 ProbabilisticTree('S', [ProbabilisticTree('NP', ['mary']) (p=1.0), 59 ProbabilisticTree('VP', ['walks']) (p=1.0)]) (p=0.111111111111)] 60 61 >>> d = GoodmanDOP([tree], parser=BitParChartParser) 62 writing grammar 63 >>> d.parser.parse("mary walks".split()) 64 ProbabilisticTree('S', [Tree('NP@1', ['mary']), Tree('VP@2', ['walks'])]) (p=0.444444) 65 >>> list(d.parser.nbest_parse("mary walks".split())) 66 [ProbabilisticTree('S', [Tree('NP@1', ['mary']), Tree('VP@2', ['walks'])]) 67 (p=0.444444), 68 ProbabilisticTree('S', [Tree('NP', ['mary']), Tree('VP@2', ['walks'])]) 69 (p=0.222222), 70 ProbabilisticTree('S', [Tree('NP@1', ['mary']), Tree('VP', ['walks'])]) 71 (p=0.222222), 72 ProbabilisticTree('S', [Tree('NP', ['mary']), Tree('VP', ['walks'])]) 73 (p=0.111111)] 74 75 TODO: parse bitpar's chart output / parse forest 76 """ 77 78 self.grammar = weightedrules 79 self.lexicon = lexicon 80 self.rootsymbol = rootsymbol 81 self.name = name 82 if name: self.id = name 83 else: self.id = uuid1() 84 self.cleanup = cleanup 85 self.n = n 86 self.unknownwords = unknownwords 87 self.openclassdfsa = openclassdfsa 88 if weightedrules and lexicon: 89 self.writegrammar('/tmp/g%s.pcfg' % self.id, '/tmp/g%s.lex' % self.id) 90 elif not name: 91 raise ValueError("need grammar or file name") 92 self.start()
93
94 - def __del__(self):
95 cmd = "rm /tmp/g%s.pcfg /tmp/g%s.lex" % (self.id, self.id) 96 if self.cleanup: Popen(cmd.split()) 97 self.stop()
98
99 - def start(self):
100 # quiet, yield best parse, show viterbi prob., use frequencies 101 options = "bitpar -q -b %d -vp -p " % (self.n) 102 # if no rootsymbol is given, 103 # bitpar defaults to the first nonterminal in the rules 104 if self.rootsymbol: options += "-s %s " % self.rootsymbol 105 if self.unknownwords: options += "-u %s " % self.unknownwords 106 if self.openclassdfsa: options += "-w %s " % self.openclassdfsa 107 if self.name: 108 options += "%s.pcfg %s.lex" % (self.id, self.id) 109 else: 110 options += "/tmp/g%s.pcfg /tmp/g%s.lex" % (self.id, self.id) 111 #if self.debug: print options.split() 112 self.bitpar = Popen(options.split(), stdin=PIPE, stdout=PIPE, stderr=PIPE)
113
114 - def stop(self):
115 if not isinstance(self.bitpar.poll(), int): 116 self.bitpar.terminate()
117
118 - def parse(self, sent):
119 if isinstance(self.bitpar.poll(), int): self.start() 120 result, stderr = self.bitpar.communicate("%s\n\n" % "\n".join(sent)) 121 if not "=" in result: 122 # bitpar returned some error or didn't produce output 123 raise ValueError("no output. stdout: \n%s\nstderr:\n%s " % (result.strip(), stderr.strip())) 124 prob, tree = result.split("\n", 1)[0], result.split("\n", 2)[1] 125 prob = float(prob.split("=")[1]) 126 tree = Tree(tree) 127 return ProbabilisticTree(tree.node, tree, prob=prob)
128
129 - def nbest_parse(self, sent, n_will_be_ignored=None):
130 """ n has to be specified in the constructor because it is specified 131 as a command line parameter to bitpar, allowing it here would require 132 potentially expensive restarts of bitpar. """ 133 if isinstance(self.bitpar.poll(), int): self.start() 134 result, stderr = self.bitpar.communicate("%s\n\n" % "\n".join(sent)) 135 results = result.split("\n")[:-1] #strip trailing blank line 136 probs = (float(a.split("=")[1]) for a in results[::2] if "=" in a) 137 trees = (Tree(a) for a in results[1::2]) 138 return (ProbabilisticTree(b.node, b, prob=a) for a, b in zip(probs, trees))
139
140 - def writegrammar(self, f, l):
141 """ write a grammar to files f and l in a format that bitpar 142 understands. f will contain the grammar rules, l the lexicon 143 with pos tags. """ 144 f, l = open(f, 'w'), open(l, 'w') 145 lex = defaultdict(list) 146 lex = defaultdict(FreqDist) 147 def process(): 148 for rule, freq in self.grammar: 149 lhs, rhs = rule.split('\t',1)[0], rule.split('\t')[1:] 150 #if len(rhs) == 1 and not isinstance(rhs[0], Nonterminal): 151 if rhs[0] in self.lexicon: 152 #lex[rhs[0]].append(" ".join(map(repr, (lhs, freq)))) 153 lex[rhs[0]].inc(lhs) 154 # this should NOT happen: (drop it like it's hot) 155 elif len(rhs) == 0 or '' in (str(a).strip() for a in rhs): continue #raise ValueError 156 else: 157 # prob^Wfrequency lhs rhs1 rhs2 158 #print "%s\t%s" % (repr(freq), rule) 159 yield "%s\t%s\n" % (repr(freq), rule)
160 #yield "%s\t%s\t%s\n" % (repr(freq), str(lhs), 161 # "\t".join(str(x) for x in rhs)) 162 #f.write(''.join(process())) 163 f.writelines(process()) 164 def proc(lex): 165 for word, tags in lex.items(): 166 # word POS1 prob^Wfrequency1 POS2 freq2 ... 167 #print "%s\t%s" % (word, "\t".join(' '.join(map(str, a)) for a in tags.items())) 168 yield "%s\t%s\n" % (word, "\t".join(' '.join(map(str, a)) for a in tags.items()))
169 l.writelines(proc(lex)) 170 f.close() 171 l.close() 172 173 if __name__ == '__main__': 174 import doctest 175 # do doctests, but don't be pedantic about whitespace (I suspect it is the 176 # militant anti-tab faction who are behind this obnoxious default) 177 fail, attempted = doctest.testmod(verbose=False, 178 optionflags=doctest.NORMALIZE_WHITESPACE | doctest.ELLIPSIS) 179 if attempted and not fail: 180 print "%d doctests succeeded!" % attempted 181