arbobanko

1 #!/usr/bin/python 2 """treebank conversion script; expects no arguments, uses stdin & stdout. 3 input is VISL horizontal tree format 4 see: http://beta.visl.sdu.dk/treebanks.html#The_source_format 5 output: s-expression, ie., tree in bracket notation. 6 TODO: turn this into a nltk.Corpus reader 7 """ 8 9 example = """X:np 10 =H:n("konsekvenco" <*> <ac> P NOM) Konsekvencoj 11 =DN:pp 12 ==H:prp("de") de 13 ==DP:np 14 ===DN:adj("ekonomia" <Deco> P NOM) ekonomiaj 15 ===H:n("transformo" P NOM) transformoj""" 16 17 example2 = """STA:fcl 18 =S:np 19 ==DN:pron-dem("tia" <*> <Dem> <Du> <dem> DET P NOM) Tiaj 20 ==H:n("akuzo" <act> <sd> P NOM) akuzoj 21 =fA:adv("certe") certe 22 =P:v-fin("dauxri" <va+TEMP> <mv> FUT VFIN) dauxros""" 23 24 example3 = """STA:par 25 =CJT:fcl 26 ==fA:adv("krome" <*>) Krome 27 ==, 28 ==S:np 29 ===DN:art("la") la 30 ===H:n("savo" <act> <event> S NOM) savo 31 ===DN:pp 32 ====H:prp("de") de 33 ====DP:np 34 =====H:n("konkuranto" <Hprof> S NOM) konkuranto 35 ==P:v-fin("helpi" <cjt-head> <vta+INF> <mv> FUT VFIN) helpos 36 =====DN:prop("Microsoft" <*> S NOM) Microsoft 37 =CJT:icl 38 ==P:v-pcp2("refi" <cjt-STA> <mv> PAS COND INF) refuti 39 ==Od:np 40 ===H:n("akuzo" <act> <sd> P ACC) akuzojn 41 ===DN:pp 42 ====H:prp("pri") pri 43 ====DP:n("monopolismo" S NOM) monopolismo""" 44 45 from nltk import Tree 46 from nltk.tokenize import word_tokenize, wordpunct_tokenize 47 from sys import stdin, stdout, stderr 48 import re 49 from itertools import chain 50

def cnf(tree):
    """Return a deep copy of `tree` in which leaves with siblings are tagged.

    Every leaf whose parent has other children besides it is replaced, in
    the copy, by a new subtree labelled "<parent label>_<leaf>" that holds
    the leaf as its only child. Leaves that are an only child are copied
    unchanged.
    """
    tagged = tree.copy(True)
    for position in tree.treepositions('leaves'):
        parent = tree[position[:-1]]
        if len(parent) == 1:
            # an only child: the parent already serves as this leaf's tag
            continue
        leaf = tree[position]
        tagged[position] = Tree("%s_%s" % (parent.node, leaf), [leaf])
    # NOTE(review): this pass extends childless subtrees of the *input*
    # `tree` in place with their own label; the returned copy is not
    # touched by it -- confirm whether `tagged` was intended here.
    for position in tree.treepositions():
        subtree = tree[position]
        if isinstance(subtree, Tree) and len(subtree) == 0:
            tree[position] += subtree.node
    return tagged

63

def leaves(xx):
    """Return the terminals of tree `xx`, in tree order, as a flat list.

    Besides the real leaves, "non-terminals" without any children are
    included as well, represented by their label. Every item is split on
    "_", so a single item can contribute several entries to the result.

    The list is built in one pass, so this does not depend on the builtin
    `reduce` (absent in Python 3) and returns [] instead of raising
    TypeError when there is nothing to collect.
    """
    words = []
    for position in xx.treepositions():
        item = xx[position]
        if isinstance(item, Tree):
            if len(item) != 0:
                # an inner node with children: only its descendants count
                continue
            text = item.node
        else:
            text = item
        words.extend(text.split("_"))
    return words

def parse(input, stripmorph=True):
    """Turn the lines of a horizontal tree into an s-expression string.

    input -- iterable of lines; the number of '=' characters in a line is
             taken as the depth of its node.
    stripmorph -- when true (the default), the '=' markers and everything
             between the first '(' and the last ')' of a line (lemma and
             morphology tags) are removed.
    Parentheses left in the text are converted to braces.

    As the last example below records, a line that is more than one level
    deeper than its predecessor, or that returns to a shallower level
    without carrying a '("' group, yields a badly nested expression; compare
    the output documented for reparse() on the same input.

    >>> print example
    X:np
    =H:n("konsekvenco" <*> <ac> P NOM) Konsekvencoj
    =DN:pp
    ==H:prp("de") de
    ==DP:np
    ===DN:adj("ekonomia" <Deco> P NOM) ekonomiaj
    ===H:n("transformo" P NOM) transformoj
    >>> parse(example.splitlines())
    '(X:np (H:n Konsekvencoj) (DN:pp (H:prp de) (DP:np (DN:adj ekonomiaj) (H:n transformoj))))'
    >>> print example2
    STA:fcl
    =S:np
    ==DN:pron-dem("tia" <*> <Dem> <Du> <dem> DET P NOM) Tiaj
    ==H:n("akuzo" <act> <sd> P NOM) akuzoj
    =fA:adv("certe") certe
    =P:v-fin("dauxri" <va+TEMP> <mv> FUT VFIN) dauxros

    >>> parse(example2.splitlines())
    '(STA:fcl (S:np (DN:pron-dem Tiaj) (H:n akuzoj)) (fA:adv certe) (P:v-fin dauxros))'
    >>> parse(example3.splitlines())
    '(STA:par (CJT:fcl (fA:adv Krome) (,) (S:np (DN:art la) (H:n savo) (DN:pp (H:prp de) (DP:np (H:n konkuranto)))) (P:v-fin helpos (((DN:prop Microsoft))))) CJT:icl (P:v-pcp2 refuti) (Od:np (H:n akuzojn) (DN:pp (H:prp pri) (DP:n monopolismo))))'
    """
    prev_depth = -1
    unclosed = 1
    pieces = []
    for line in input:
        depth = line.count('=')
        has_morphology = '("' in line
        if depth == prev_depth and prev_depth != 0:
            # a sibling of the previous node: close that one, open this one
            pieces.append(') (')
        else:
            if depth < prev_depth:
                closing = 1 + prev_depth - depth
                pieces.append(')' * closing)
                unclosed -= closing
            if depth > prev_depth:
                opening = depth - prev_depth
                pieces.append(' (' * opening)
                unclosed += opening
            elif has_morphology:
                pieces.append(' (')
                unclosed += 1
        prev_depth = depth

        # drop lemma & morphology tags if requested; braces replace the
        # remaining parentheses because nltk.Tree gets confused by them
        text = line
        if stripmorph:
            text = re.sub(r"=+|\(.*\)", "", text)
        words = text.replace('(', '{').replace(')', '}').split()
        # a line consisting of a single token is emitted as it is
        pieces.append(" " + " ".join(words))

    pieces.append((unclosed - 1) * ')')
    return "".join(pieces).replace('( ', '(')[1:]

# Splits a tree line into its run of leading '=' depth markers (group 1,
# possibly empty) and the remainder of the line (group 2).
relinelev = re.compile(r'(=*)(.*)')
# Matches optional whitespace followed by a parenthesised group, capturing
# the first whitespace-free token inside the parentheses.
# NOTE(review): no live code in the visible file appears to use this pattern.
reclean = re.compile(r'\s*\((\S+)[^)]*\)')

def clean(a, stripmorph=True):
    """Return the text of one tree node as a whitespace-normalised string.

    a -- the content of a tree line, without its leading '=' markers.
    stripmorph -- when true (the default), remove lemma & morphology tags,
        i.e. everything from the first '(' up to and including the last
        ')' of the text (the match is greedy).

    Any parentheses that remain are replaced by braces because nltk.Tree
    gets confused by them, and runs of whitespace collapse to one space.
    A string is returned for either value of `stripmorph`, so the result
    can always be formatted straight into an s-expression.
    """
    if stripmorph:
        a = re.sub(r"\(.*\)", "", a)
    return " ".join(a.replace('(', '{').replace(')', '}').split())

146

def reparse(tree):
    """Turn the lines of a horizontal tree into an s-expression string.

    The approach was contributed by Alex Martelli at StackOverflow:
    http://stackoverflow.com/questions/2815020/converting-a-treebank-of-vertical-trees-to-s-expressions

    tree -- iterable of lines; the run of '=' at the start of a line gives
            the depth of its node, the remainder is passed through clean(),
            which by default strips morphology information and converts
            parentheses to braces.
    Returns the tree in bracket notation (ie., WSJ format); an empty input
    yields the empty string.

    >>> reparse(example.splitlines())
    '(X:np (H:n Konsekvencoj) (DN:pp (H:prp de) (DP:np (DN:adj ekonomiaj) (H:n transformoj))))'
    >>> reparse(example2.splitlines())
    '(STA:fcl (S:np (DN:pron-dem Tiaj) (H:n akuzoj)) (fA:adv certe) (P:v-fin dauxros))'
    >>> reparse(example3.splitlines())
    '(STA:par (CJT:fcl (fA:adv Krome) (,) (S:np (DN:art la) (H:n savo) (DN:pp (H:prp de) (DP:np (H:n konkuranto)))) (P:v-fin helpos (DN:prop Microsoft))) (CJT:icl (P:v-pcp2 refuti) (Od:np (H:n akuzojn) (DN:pp (H:prp pri) (DP:n monopolismo)))))'
    """
    # depths of the nodes that are still open; -1 is a sentinel below the root
    open_levels = [-1]
    nodes = []
    for line in tree:
        markers, content = relinelev.match(line).groups()
        depth = len(markers)
        # close every open node that lies deeper than this line
        while open_levels[-1] > depth:
            nodes[-1] += ')'
            open_levels.pop()
        if open_levels[-1] != depth:
            # first node on a new, deeper level
            open_levels.append(depth)
        else:
            # a sibling: close the previous node of this level
            nodes[-1] += ')'
        nodes.append('(%s' % clean(content))
    if nodes:
        # one closing bracket for each level above the sentinel
        nodes[-1] += ')' * (len(open_levels) - 1)
    return ' '.join(nodes)

179

def main():
    """Convert a treebank read from stdin to s-expressions on stdout.

    Takes a treebank in horizontal tree format, and outputs it in
    s-expression format (ie., bracket notation, WSJ format), one tree per
    line. Checks whether the original sentence and the leaves of the tree
    match, and discards the tree if they don't. Also removes trees marked
    problematic with the tag "CAVE" in the comments. Diagnostics and the
    final counts are written to stderr. Example input:

    <s_id=812>
    SOURCE: id=812
    ID=812 Necesus adapti la metodon por iuj alilandaj klavaroj.
    A1
    STA:fcl
    =P:v-fin("necesi" <*> <mv> COND VFIN) Necesus
    =S:icl
    ==P:v-inf("adapti" <mv>) adapti
    ==Od:np
    ===DN:art("la") la
    ===H:n("metodo" <ac> S ACC) metodon
    ===DN:pp
    ====H:prp("por" <aquant>) por
    ====DP:np
    =====DN:pron("iu" <quant> DET P NOM) iuj
    =====DN:adj("alilanda" P NOM) alilandaj
    =====H:n("klavaro" <cc-h> <tool-mus> P NOM) klavaroj
    .

    </s>
    """
    correct = n = cave = mismatch = failed = 0
    # `s` is falsy outside a sentence block. Inside one it counts the lines
    # seen since the opening "<s" tag (the ID line does not advance it);
    # lines are collected as tree lines once it has reached 4.
    s = False
    # tokens of the sentence on the ID line; None while no ID line was seen,
    # so that a block without one is reported as a mismatch rather than
    # raising NameError or being compared with the previous sentence
    sent = None
    for a in stdin:
        if s and a[:4] == "</s>":
            s = 0
            # a sentence-final period sits on a line of its own, followed by
            # one more line; a tree too short to have such a line is left
            # untouched instead of raising IndexError
            if len(tree) >= 2 and tree[-2].strip() == '.':
                tree = tree[:-2]
                period = ' .'
            else:
                period = ''
            x = "(TOP %s%s)" % (reparse(tree).replace('()', ''), period)

            try:
                xx = cnf(Tree(x))
            except ValueError:
                # this only happens when our output fails to parse
                # (malformed s-expression -- eg. unbalanced parens):
                stderr.write("""failed to parse:
input: %s
output: %s
""" % ("".join(tree), str(x)))
                failed += 1
                continue
            words = leaves(xx)
            if sent == words:
                # this tree is fine
                stdout.write("%s\n" % x)
                correct += 1
            else:
                # this happens when the leaves do not agree with the
                # original sentence in the comment line above the tree
                stderr.write("""sentence-leaves mismatch!
expected: %s
got: %s
tree:
%s
""" % (repr(sent), repr(words), str(x)))
                mismatch += 1
        elif s and a[:2] == "ID":
            # "ID=123 the sentence." => ['the', 'sentence', '.']
            # partition() leaves an empty sentence when the line has no
            # space, where split(' ', 1)[1] would raise IndexError
            text = a.partition(' ')[2]
            sent = word_tokenize(text.replace('(', '{').replace(')', '}'))
        elif s and "CAVE" in a:
            # skip trees with errors (a cave circularity is
            # a circular dependency link, which is illegal)
            cave += 1
            s = 0
        elif s:
            s += 1

        # comment lines are never collected; note that the counter above
        # has already been advanced for them at this point
        if a[0] == "#":
            continue

        if s >= 4:
            tree.append(a)

        if not s and a[:2] == "<s":
            s = 1
            n += 1
            tree = []
            sent = None
    stderr.write("converted %d of %d trees in input\n" % (correct, n))
    stderr.write("cave circularities: %d, sentence-leaves mismatches: %d\nmalformed s-expression output: %d\n" % (cave, mismatch, failed))


if __name__ == '__main__':
    import doctest
    # do doctests, but don't be pedantic about whitespace
    fail, attempted = doctest.testmod(verbose=False,
        optionflags=doctest.NORMALIZE_WHITESPACE | doctest.ELLIPSIS)
    if attempted and not fail:
        stderr.write("%d doctests succeeded!\n" % attempted)
    main()

Source Code for Module arbobanko