1
"""Treebank conversion script; expects no arguments, uses stdin and stdout.

Input is in the VISL horizontal tree format, see:
http://beta.visl.sdu.dk/treebanks.html#The_source_format
Output is an s-expression, i.e., a tree in bracket notation.
TODO: turn this into an nltk.Corpus reader.
"""
8
# Sample trees in the VISL horizontal format, used by the doctests in this
# file.  Each leading '=' marks one level of nesting; a word line has the form
#   FUNCTION:form("lemma" <tags> MORPHOLOGY) word

# A noun phrase with an embedded prepositional phrase.
example = """X:np
=H:n("konsekvenco" <*> <ac> P NOM) Konsekvencoj
=DN:pp
==H:prp("de") de
==DP:np
===DN:adj("ekonomia" <Deco> P NOM) ekonomiaj
===H:n("transformo" P NOM) transformoj"""

# A finite clause whose last two children sit directly under the root.
example2 = """STA:fcl
=S:np
==DN:pron-dem("tia" <*> <Dem> <Du> <dem> DET P NOM) Tiaj
==H:n("akuzo" <act> <sd> P NOM) akuzoj
=fA:adv("certe") certe
=P:v-fin("dauxri" <va+TEMP> <mv> FUT VFIN) dauxros"""

# A larger tree with irregular nesting: the "Microsoft" line jumps from
# depth 2 straight to depth 5, and there is a bare punctuation line ("==,").
example3 = """STA:par
=CJT:fcl
==fA:adv("krome" <*>) Krome
==,
==S:np
===DN:art("la") la
===H:n("savo" <act> <event> S NOM) savo
===DN:pp
====H:prp("de") de
====DP:np
=====H:n("konkuranto" <Hprof> S NOM) konkuranto
==P:v-fin("helpi" <cjt-head> <vta+INF> <mv> FUT VFIN) helpos
=====DN:prop("Microsoft" <*> S NOM) Microsoft
=CJT:icl
==P:v-pcp2("refi" <cjt-STA> <mv> PAS COND INF) refuti
==Od:np
===H:n("akuzo" <act> <sd> P ACC) akuzojn
===DN:pp
====H:prp("pri") pri
====DP:n("monopolismo" S NOM) monopolismo"""
44
import re
from itertools import chain
from sys import stdin, stdout, stderr

from nltk import Tree
from nltk.tokenize import word_tokenize, wordpunct_tokenize
50
def cnf(tree):
    """Return a copy of `tree` in which every terminal has its own POS node.

    A terminal that shares its parent with other children is wrapped in a
    new node labelled "parent_word".  A non-terminal without any children
    receives its own label as its single terminal.  The argument itself is
    left untouched; all changes are made on a deep copy.
    """
    result = tree.copy(True)
    for pos in tree.treepositions('leaves'):
        parent = tree[pos[:-1]]
        # A terminal that is an only child already has its parent as POS tag.
        if len(parent) != 1:
            result[pos] = Tree("%s_%s" % (parent.node, tree[pos]), [tree[pos]])
    for pos in tree.treepositions():
        if isinstance(tree[pos], Tree) and len(tree[pos]) == 0:
            # append() adds the label as one child; "+=" with a string would
            # add it character by character.
            result[pos].append(tree[pos].node)

    return result
63
def leaves(xx):
    """Return the words of tree `xx` as a flat list, in tree order.

    Terminals are taken as they are; a "non-terminal" without children
    contributes its label instead.  Every item is split on "_", so a single
    terminal can yield several words.
    """
    def label(item):
        # Childless non-terminals stand in for the word they are labelled with.
        return item.node if isinstance(item, Tree) else item

    words = (label(xx[pos]).split("_")
             for pos in xx.treepositions()
             if not isinstance(xx[pos], Tree) or len(xx[pos]) == 0)
    return list(chain.from_iterable(words))
72
def parse(input, stripmorph=True):
    """Parse a horizontal tree into an s-expression (i.e., WSJ format).

    `input` is an iterable of lines; the number of leading '=' characters
    on a line gives its depth.  With `stripmorph` (the default) everything
    from the first '(' to the last ')' on a line is dropped; otherwise it is
    kept.  Remaining parentheses in the input are converted to braces.
    The depth markers themselves never end up in the output.

    Irregular nesting is not repaired: as the third example shows, a jump
    of several levels opens several brackets, and a line that returns to a
    shallower depth without a '("' lemma part gets no bracket of its own.

    >>> print(example)
    X:np
    =H:n("konsekvenco" <*> <ac> P NOM) Konsekvencoj
    =DN:pp
    ==H:prp("de") de
    ==DP:np
    ===DN:adj("ekonomia" <Deco> P NOM) ekonomiaj
    ===H:n("transformo" P NOM) transformoj
    >>> parse(example.splitlines())
    '(X:np (H:n Konsekvencoj) (DN:pp (H:prp de) (DP:np (DN:adj ekonomiaj) (H:n transformoj))))'
    >>> print(example2)
    STA:fcl
    =S:np
    ==DN:pron-dem("tia" <*> <Dem> <Du> <dem> DET P NOM) Tiaj
    ==H:n("akuzo" <act> <sd> P NOM) akuzoj
    =fA:adv("certe") certe
    =P:v-fin("dauxri" <va+TEMP> <mv> FUT VFIN) dauxros

    >>> parse(example2.splitlines())
    '(STA:fcl (S:np (DN:pron-dem Tiaj) (H:n akuzoj)) (fA:adv certe) (P:v-fin dauxros))'
    >>> parse(example3.splitlines())
    '(STA:par (CJT:fcl (fA:adv Krome) (,) (S:np (DN:art la) (H:n savo) (DN:pp (H:prp de) (DP:np (H:n konkuranto)))) (P:v-fin helpos (((DN:prop Microsoft))))) CJT:icl (P:v-pcp2 refuti) (Od:np (H:n akuzojn) (DN:pp (H:prp pri) (DP:n monopolismo))))'
    """
    depth = -1      # depth of the previous line; -1 before the first line
    unclosed = 1    # one more than the number of brackets still open
    pieces = []
    for line in input:
        # Only the leading run of '=' encodes depth; an '=' inside a word or
        # lemma must not be counted.
        level = len(line) - len(line.lstrip('='))
        has_lemma = '("' in line

        if level == depth and depth != 0:
            # Sibling of the previous line: close it, open the next one.
            pieces.append(') (')
        else:
            if level < depth:
                closing = 1 + depth - level
                pieces.append(')' * closing)
                unclosed -= closing
            if level > depth:
                pieces.append(' (' * (level - depth))
                unclosed += level - depth
            elif has_lemma:
                pieces.append(' (')
                unclosed += 1
        depth = level

        text = line[level:]
        if stripmorph:
            text = re.sub(r"\(.*\)", "", text)
        words = text.replace('(', '{').replace(')', '}').split()
        pieces.append(" " + " ".join(words))

    pieces.append(')' * (unclosed - 1))
    # Every fragment starts with a space: tidy "( X" into "(X" and drop the
    # single leading space of the whole expression.
    return "".join(pieces).replace('( ', '(')[1:]
133
134
# Splits a line into its run of leading '=' (the depth marker) and the rest.
relinelev = re.compile(r'(=*)(.*)')
# Matches a parenthesised group, with optional whitespace before it, and
# captures the first whitespace-free token inside the parentheses.
reclean = re.compile(r'\s*\((\S+)[^)]*\)')
137
def clean(a, stripmorph=True):
    """Normalise the text of one tree line (depth marker already removed).

    With `stripmorph` (the default) everything from the first '(' to the
    last ')' in `a` is removed.  Any remaining parentheses are converted to
    braces and runs of whitespace collapse to single spaces.

    Always returns a string, whatever the value of `stripmorph`.
    """
    if stripmorph:
        a = re.sub(r"\(.*\)", "", a)
    return " ".join(a.replace('(', '{').replace(')', '}').split())
146
def reparse(tree):
    """Parse a horizontal tree into an s-expression (i.e., WSJ format).

    Code contributed by Alex Martelli at StackOverflow:
    http://stackoverflow.com/questions/2815020/converting-a-treebank-of-vertical-trees-to-s-expressions

    `tree` is an iterable of lines; the number of leading '=' characters on
    a line gives its depth.  The rest of each line is passed through
    clean() with its default settings.  Every line yields exactly one
    bracketed node, so irregular jumps in depth still produce balanced
    output.

    >>> reparse(example.splitlines())
    '(X:np (H:n Konsekvencoj) (DN:pp (H:prp de) (DP:np (DN:adj ekonomiaj) (H:n transformoj))))'
    >>> reparse(example2.splitlines())
    '(STA:fcl (S:np (DN:pron-dem Tiaj) (H:n akuzoj)) (fA:adv certe) (P:v-fin dauxros))'
    >>> reparse(example3.splitlines())
    '(STA:par (CJT:fcl (fA:adv Krome) (,) (S:np (DN:art la) (H:n savo) (DN:pp (H:prp de) (DP:np (H:n konkuranto)))) (P:v-fin helpos (DN:prop Microsoft))) (CJT:icl (P:v-pcp2 refuti) (Od:np (H:n akuzojn) (DN:pp (H:prp pri) (DP:n monopolismo)))))'
    """
    # Depths of the brackets that are currently open; -1 is a sentinel that
    # is never popped, so stack[-1] is always valid.
    stack = [-1]
    result = []
    for line in tree:
        equals, rest = relinelev.match(line).groups()
        linelev = len(equals)
        # Close every open node that lies deeper than this line.
        while linelev < stack[-1]:
            result[-1] += ')'
            stack.pop()
        if linelev == stack[-1]:
            # Sibling of the node on top of the stack: close that one; this
            # line takes over its stack slot.
            result[-1] += ')'
        else:
            stack.append(linelev)
        result.append('(%s' % clean(rest))
    # Close whatever is still open at the end of the input.
    while stack[-1] >= 0:
        result[-1] += ')'
        stack.pop()
    return ' '.join(result)
179
def main():
    """Convert a treebank on stdin from horizontal tree format to
    s-expressions (i.e., bracket notation, WSJ format) on stdout.

    Checks whether the original sentence and the leaves of the tree match,
    and discards the tree if they don't.  Also removes trees marked
    problematic with the tag "CAVE" in the comments.  Diagnostics and a
    final summary are written to stderr.  Example input:

        <s_id=812>
        SOURCE: id=812
        ID=812 Necesus adapti la metodon por iuj alilandaj klavaroj.
        A1
        STA:fcl
        =P:v-fin("necesi" <*> <mv> COND VFIN) Necesus
        =S:icl
        ==P:v-inf("adapti" <mv>) adapti
        ==Od:np
        ===DN:art("la") la
        ===H:n("metodo" <ac> S ACC) metodon
        ===DN:pp
        ====H:prp("por" <aquant>) por
        ====DP:np
        =====DN:pron("iu" <quant> DET P NOM) iuj
        =====DN:adj("alilanda" P NOM) alilandaj
        =====H:n("klavaro" <cc-h> <tool-mus> P NOM) klavaroj
        .

        </s>
    """
    correct = n = cave = mismatch = failed = 0
    # `s` is 0 outside an <s> element.  Inside one it starts at 1 and counts
    # the lines that are neither the ID line, a CAVE line nor the closing
    # tag; lines are collected as tree lines once it reaches 4.
    s = 0
    tree = []
    sent = None
    for a in stdin:
        if s and a[:4] == "</s>":
            s = 0
            # A final "." line followed by one more line (blank in the
            # example above) is cut off and re-attached directly under TOP.
            if len(tree) >= 2 and tree[-2].strip() == '.':
                tree = tree[:-2]
                period = ' .'
            else:
                period = ''
            # Lines without content come out of reparse() as "()".
            x = "(TOP %s%s)" % (reparse(tree).replace('()', ''), period)

            try:
                xx = cnf(Tree(x))
            except ValueError:
                stderr.write("failed to parse:\ninput: %s\noutput: %s\n"
                             % ("".join(tree), str(x)))
                failed += 1
                continue
            words = leaves(xx)
            if sent == words:
                stdout.write("%s\n" % x)
                correct += 1
            else:
                stderr.write("sentence-leaves mismatch!\nexpected: %s\n"
                             "got: %s\ntree:\n%s\n"
                             % (repr(sent), repr(words), str(x)))
                mismatch += 1
        elif s and a[:2] == "ID":
            # The sentence is everything after the first space; tokenize it
            # with the same bracket-to-brace conversion the tree lines get.
            text = a.partition(' ')[2]
            sent = word_tokenize(text.replace('(', '{').replace(')', '}'))
        elif s and "CAVE" in a:
            # Marked as problematic: abandon this element.
            cave += 1
            s = 0
        elif s:
            s += 1
            if a[0] == "#":
                continue
            if s >= 4:
                tree.append(a)

        if not s and a[:2] == "<s":
            s = 1
            n += 1
            tree = []
            # Forget the previous sentence so that an element without an ID
            # line is reported as a mismatch instead of reusing stale words.
            sent = None
    stderr.write("converted %d of %d trees in input\n" % (correct, n))
    stderr.write("cave circularities: %d, sentence-leaves mismatches: %d\nmalformed s-expression output: %d\n" % (cave, mismatch, failed))
271
272
if __name__ == '__main__':
    import doctest

    # Run the doctests first, ignoring differences in whitespace, and report
    # on stderr so that stdout carries only converted trees.
    fail, attempted = doctest.testmod(
        verbose=False,
        optionflags=doctest.NORMALIZE_WHITESPACE | doctest.ELLIPSIS)
    if attempted and not fail:
        stderr.write("%d doctests succeeded!\n" % attempted)
    main()
282