Ñò
©­>Lc           @   sÖ   d  Z  d d k l Z d d k l Z l Z d d k l Z d d k l	 Z	 l
 Z
 l Z l Z d d d „  ƒ  YZ e d j oT d d	 k Z e i d
 e d e i e i Bƒ \ Z Z e o e o d e GHqÒ n d	 S(   s   Shell interface to bitpar, an efficient chart parser for (P)CFGs.
Expects bitpar to be compiled and available in the PATH.
Currently only yields the one best parse tree without its probability.
Todo: 
 - yield n best parses with probabilites (parameter)
 - parse chart outputiÿÿÿÿ(   t   defaultdict(   t   Popent   PIPE(   t   uuid1(   t   Treet   ProbabilisticTreet   FreqDistt   InsideChartParsert   BitParChartParserc        	   B   sb   e  Z d	 d	 d	 d	 d	 e d  d d „ Z d „  Z d „  Z d „  Z d „  Z d	 d „ Z	 d „  Z
 RS(
   i
   t    c	   	      C   s½   | |  _  | |  _ | |  _ | |  _ | o | |  _ n t ƒ  |  _ | |  _ | |  _ | |  _ | |  _	 | o) | o" |  i
 d |  i d |  i ƒ n | p t d ƒ ‚ n |  i ƒ  d S(   sN   Interface to bitpar chart parser. Expects a list of weighted
		productions with frequencies (not probabilities).
		
		@param weightedrules: sequence of tuples with strings 
			(lhs and rhs separated by tabs, eg. "S NP VP") and
			frequencies. The reason we use this format is that
			it is close to bitpar's file format; converting a
			weighted grammar with probabilities to frequencies
			would be a detour, and bitpar wants frequencies so
			it can do smoothing.
		@param lexicon: set of strings belonging to the lexicon
			(ie., the set of terminals)
		@param rootsymbol: starting symbol for the grammar
		@param unknownwords: a file with a list of open class POS tags 
			with frequencies
		@param openclassdfsa: a deterministic finite state automaton,
			refer to the bitpar manpage.
		@param cleanup: boolean, when set to true the grammar files will be
			removed when the BitParChartParser object is deleted.
		@param name: filename of grammar files in case you want to export it,
			if not given will default to a unique identifier
		@param n: the n best parse trees will be requested
		>>> wrules = (	("S\tNP\tVP", 1), 				("NP\tmary", 1), 				("VP\twalks", 1) )
		>>> p = BitParChartParser(wrules, set(("mary","walks")))
		>>> tree = p.parse("mary walks".split())
		>>> print tree
		(S (NP mary) (VP walks)) (p=1.0)

		>>> from dopg import GoodmanDOP
		>>> d = GoodmanDOP([tree], parser=InsideChartParser)
		>>> d.parser.parse("mary walks".split())
		ProbabilisticTree('S', [ProbabilisticTree('NP@1', ['mary'])
		(p=1.0), ProbabilisticTree('VP@2', ['walks']) (p=1.0)])
		(p=0.444444444444)
		>>> d.parser.nbest_parse("mary walks".split(), 10)
		[ProbabilisticTree('S', [ProbabilisticTree('NP@1', ['mary']) (p=1.0),
			ProbabilisticTree('VP@2', ['walks']) (p=1.0)]) (p=0.444444444444),
		ProbabilisticTree('S', [ProbabilisticTree('NP', ['mary']) (p=1.0),
			ProbabilisticTree('VP@2', ['walks']) (p=1.0)]) (p=0.222222222222),
		ProbabilisticTree('S', [ProbabilisticTree('NP@1', ['mary']) (p=1.0),
			ProbabilisticTree('VP', ['walks']) (p=1.0)]) (p=0.222222222222),
		ProbabilisticTree('S', [ProbabilisticTree('NP', ['mary']) (p=1.0),
			ProbabilisticTree('VP', ['walks']) (p=1.0)]) (p=0.111111111111)]

		>>> d = GoodmanDOP([tree], parser=BitParChartParser)
		    writing grammar
		>>> d.parser.parse("mary walks".split())
		ProbabilisticTree('S', [Tree('NP@1', ['mary']), Tree('VP@2', ['walks'])]) (p=0.444444)
		>>> list(d.parser.nbest_parse("mary walks".split()))
		[ProbabilisticTree('S', [Tree('NP@1', ['mary']), Tree('VP@2', ['walks'])]) 
		(p=0.444444),
		ProbabilisticTree('S', [Tree('NP', ['mary']), Tree('VP@2', ['walks'])])
		(p=0.222222),
		ProbabilisticTree('S', [Tree('NP@1', ['mary']), Tree('VP', ['walks'])])
		(p=0.222222), 
		ProbabilisticTree('S', [Tree('NP', ['mary']), Tree('VP', ['walks'])])
		(p=0.111111)]

		TODO: parse bitpar's chart output / parse forest
		s   /tmp/g%s.pcfgs   /tmp/g%s.lexs   need grammar or file nameN(   t   grammart   lexicont
   rootsymbolt   namet   idR   t   cleanupt   nt   unknownwordst   openclassdfsat   writegrammart
   ValueErrort   start(	   t   selft   weightedrulesR   R   R   R   R   R   R   (    (    s0   /home/andreas/sslp/dop-transformations/bitpar.pyt   __init__   s     @				 				"c         C   sB   d |  i  |  i  f } |  i o t | i ƒ  ƒ n |  i ƒ  d  S(   Ns   rm /tmp/g%s.pcfg /tmp/g%s.lex(   R   R   R   t   splitt   stop(   R   t   cmd(    (    s0   /home/andreas/sslp/dop-transformations/bitpar.pyt   __del__^   s    
 c         C   sö   d |  i  |  _ |  i o |  i d |  i 7_ n |  i o |  i d |  i 7_ n |  i o |  i d |  i 7_ n |  i o# |  i d |  i |  i f 7_ n  |  i d |  i |  i f 7_ t |  i i ƒ  d t	 d t	 d t	 ƒ|  _
 d  S(	   Ns   bitpar -q -b %d -vp -p s   -s %s s   -u %s s   -w %s s   /tmp/g%s.pcfg /tmp/g%s.lext   stdint   stdoutt   stderr(   R   R   R   R   R   R   R   R   R   R   t   bitpar(   R   (    (    s0   /home/andreas/sslp/dop-transformations/bitpar.pyR   c   s    
 
 
 
#c         C   s.   t  |  i i ƒ  t ƒ p |  i i ƒ  n d  S(   N(   t
   isinstanceR    t   pollt   intt	   terminate(   R   (    (    s0   /home/andreas/sslp/dop-transformations/bitpar.pyR   s   s    c         C   sF  t  |  i i ƒ  t ƒ o |  i ƒ  n y) |  i i d d i | ƒ ƒ \ } } WnX |  i ƒ  |  i i i ƒ  GH|  i i	 i ƒ  GH|  i i d d i | ƒ ƒ \ } } n Xd | j o& t
 d | i ƒ  | i ƒ  f ƒ ‚ n | i d d ƒ d | i d d ƒ d } } t | i d ƒ d ƒ } t | ƒ } t | i | d | ƒS(	   Nu   %s

s   
t   =u"   no output. stdout: 
%s
stderr:
%s i   i    i   t   prob(   R!   R    R"   R#   R   t   communicatet   joinR   t   readR   R   t   stripR   t   floatR   R   t   node(   R   t   sentt   resultR   R&   t   tree(    (    s0   /home/andreas/sslp/dop-transformations/bitpar.pyt   parsew   s     )
)&-c         C   s­   t  |  i i ƒ  t ƒ o |  i ƒ  n |  i i d d i | ƒ ƒ \ } } | i ƒ  d  } d „  | d d d … Dƒ } d „  | d d d … Dƒ } d	 „  t | | ƒ Dƒ S(
   s¶    n has to be specified in the constructor because it is specified
		as a command line parameter to bitpar, allowing it here would require
		potentially expensive restarts of bitpar. u   %s

s   
iÿÿÿÿc         s   s=   x6 |  ]/ } d  | j o t  | i d  ƒ d ƒ Vq q Wd S(   R%   i   N(   R+   R   (   t   .0t   a(    (    s0   /home/andreas/sslp/dop-transformations/bitpar.pys	   <genexpr>   s   	 Ni   c         s   s   x |  ] } t  | ƒ Vq Wd  S(   N(   R   (   R1   R2   (    (    s0   /home/andreas/sslp/dop-transformations/bitpar.pys	   <genexpr>‘   s   	 i   c         s   s1   x* |  ]# \ } } t  | i | d  | ƒVq Wd S(   R&   N(   R   R,   (   R1   R2   t   b(    (    s0   /home/andreas/sslp/dop-transformations/bitpar.pys	   <genexpr>’   s   	 (	   R!   R    R"   R#   R   R'   R(   t
   splitlinest   zip(   R   R-   t   n_will_be_ignoredR.   R   t   resultst   probst   trees(    (    s0   /home/andreas/sslp/dop-transformations/bitpar.pyt   nbest_parse‰   s     %c            s   t  | d ƒ t  | d ƒ } } t t ƒ ‰ t t ƒ ‰ ‡ ‡  f d †  } | i | ƒ  ƒ d „  } | i | ˆ ƒ ƒ | i ƒ  | i ƒ  d S(   sŒ    write a grammar to files f and l in a format that bitpar 
		understands. f will contain the grammar rules, l the lexicon 
		with pos tags. t   wc          3   s»   x´ ˆ i  D]© \ }  } |  i d d ƒ d |  i d ƒ d } } | d ˆ i j o ˆ  | d i | ƒ q
 t | ƒ d j p d d „  | Dƒ j o q
 q
 d t | ƒ |  f Vq
 Wd  S(   Ns   	i   i    R	   c         s   s%   x |  ] } t  | ƒ i ƒ  Vq Wd  S(   N(   t   strR*   (   R1   R2   (    (    s0   /home/andreas/sslp/dop-transformations/bitpar.pys	   <genexpr>£   s   	 u   %s	%s
(   R
   R   R   t   inct   lent   repr(   t   rulet   freqt   lhst   rhs(   t   lexR   (    s0   /home/andreas/sslp/dop-transformations/bitpar.pyt   process›   s    
 ** c         s   sI   xB |  i  ƒ  D]4 \ } } d | d i d „  | i  ƒ  Dƒ ƒ f Vq Wd  S(   Nu   %s	%s
s   	c         s   s@   x9 |  ]2 } | d  i  ƒ  o d i t t | ƒ ƒ Vq q Wd S(   i    t    N(   R*   R(   t   mapR<   (   R1   R2   (    (    s0   /home/andreas/sslp/dop-transformations/bitpar.pys	   <genexpr>°   s   	 (   t   itemsR(   (   RD   t   wordt   tags(    (    s0   /home/andreas/sslp/dop-transformations/bitpar.pyt   proc¬   s     N(   t   openR    t   listR   t
   writelinest   close(   R   t   ft   lRE   RK   (    (   R   RD   s0   /home/andreas/sslp/dop-transformations/bitpar.pyR   ”   s    	
N(   t   __name__t
   __module__t   Nonet   TrueR   R   R   R   R0   R:   R   (    (    (    s0   /home/andreas/sslp/dop-transformations/bitpar.pyR      s   !P				t   __main__Nt   verboset   optionflagss   %d doctests succeeded!(    (   t   __doc__t   collectionsR    t
   subprocessR   R   t   uuidR   t   nltkR   R   R   R   R   RR   t   doctestt   testmodt   Falset   NORMALIZE_WHITESPACEt   ELLIPSISt   failt	   attempted(    (    (    s0   /home/andreas/sslp/dop-transformations/bitpar.pyt   <module>   s   "¨