Ñò
†ËKc           @   sû   d  Z  d d k l Z d d k l Z l Z d d k l Z l Z l	 Z	 d d k l
 Z
 l Z l Z l Z d „  Z d d d „  ƒ  YZ d	 „  Z e d
 j o[ d d k Z e i d e d e i e i Bƒ \ Z Z e o e o d e GHn e ƒ  n d S(   sB   DOP1 implementation. Andreas van Cranenburgh <andreas@unstable.nl>iÿÿÿÿ(   t   defaultdict(   t   chaint   count(   t
   Productiont   WeightedProductiont   WeightedGrammar(   t   Treet   Nonterminalt   FreqDistt   InsideChartParserc         c   s1   x* |  D]" } x | D] } | | f Vq Wq Wd S(   s$    cartesian product of two sequences N(    (   t   at   bt   xt   y(    (    s   /home/andreas/coglang/dopg.pyt   cartprod
   s
      t
   GoodmanDOPc           B   sD   e  Z d  d „ Z d „  Z d d „ Z d „  Z d „  Z d „  Z RS(   t   Sc         C   s  g  t  ƒ  t ƒ  } } } xY | D]Q } |  i | | ƒ } |  i | | ƒ |  i | | ƒ | i |  i | | ƒ ƒ q! W|  i | | ƒ |  _ | i ƒ  |  _	 | i
 ƒ  |  _ t d „  | i
 ƒ  Dƒ ƒ |  _ t t | ƒ |  i ƒ |  _ t |  i ƒ |  _ |  ` ~ ~ d S(   s¯   initialize a DOP model given a treebank. uses the Goodman
		reduction of a STSG to a PCFG.  after initialization,
		self.parser will contain an InsideChartParser.

		>>> tree = Tree("(S (NP mary) (VP walks))")
		>>> d = GoodmanDOP([tree])
		>>> d.parser.nbest_parse("mary walks".split())[0]
		ProbabilisticTree('S', [ProbabilisticTree('NP@1', ['mary']) (p=0.5), 
		ProbabilisticTree('VP@2', ['walks']) (p=0.5)]) (p=0.111111111111)c         s   s@   x9 |  ]2 } d  | j o | i  d  ƒ d d d … Vq q Wd S(   t   @Niÿÿÿÿ(   t   split(   t   .0R
   (    (    s   /home/andreas/coglang/dopg.pys	   <genexpr>/   s   	 N(   R   R   t   decorate_with_idst   nodefreqt   extendt   goodmant   probabilitiest   pcfgt   nextt	   addressest   keyst   nonterminalst   dictt   nonterminalR   R   t   grammarR	   t   parser(   t   selft   treebankt
   rootsymbolt   cfgt   nonterminalfdt   uuidst   treet   utree(    (    s   /home/andreas/coglang/dopg.pyt   __init__   s    
 	c         C   sF   | i  t ƒ } x0 | i ƒ  D]" } d | i | i ƒ  f | _ q W| S(   s$   add unique identifiers to each non-terminal of a tree.

		>>> tree = Tree("(S (NP mary) (VP walks))")
		>>> d = GoodmanDOP([tree])
		>>> d.decorate_with_ids(tree, count())
		Tree('S@0', [Tree('NP@1', ['mary']), Tree('VP@2', ['walks'])])

			@param uuids: an iterator yielding a stream of IDss   %s@%d(   t   copyt   Truet   subtreest   nodeR   (   R"   R(   R'   R)   R
   (    (    s   /home/andreas/coglang/dopg.pyR   6   s
    	  i   c            s„   t  | t ƒ oS t | ƒ d j o@ t d „  ‡ ‡  f d †  | Dƒ ƒ } ˆ  i | i d | ƒ| Sˆ  i t | ƒ d | ƒ| Sd S(   sf  count frequencies of nodes by calculating the number of
		subtrees headed by each node.

		>>> fd = FreqDist()
		>>> tree = Tree("(S (NP mary) (VP walks))")
		>>> d = GoodmanDOP([tree])
		>>> d.nodefreq(tree, fd)
		9
		>>> fd.items()
		[('S', 9), ('NP', 2), ('VP', 2), ('mary', 1), ('walks', 1)]

			@param nonterminalfd: the FreqDist to store the counts in.i    c         S   s   |  | S(    (    (   R   R   (    (    s   /home/andreas/coglang/dopg.pyt   <lambda>R   s    c         3   s)   x" |  ] } ˆ  i  | ˆ ƒ d  Vq Wd S(   i   N(   R   (   R   R   (   R"   R&   (    s   /home/andreas/coglang/dopg.pys	   <genexpr>S   s   	 R   N(   t
   isinstanceR   t   lent   reducet   incR.   t   str(   R"   R(   R&   t   leavest   n(    (   R&   R"   s   /home/andreas/coglang/dopg.pyR   D   s    #	c         c   sÃ   x¼ t  | i ƒ  | i ƒ  ƒ D]Ÿ \ } } t | i ƒ  ƒ d j o | i ƒ  | i ƒ  f } n" t t  | i ƒ  | i ƒ  ƒ Œ  } x: t | i ƒ  | i ƒ  f | ƒ D] \ } } | | f Vq  Wq Wd S(   sÛ   given a parsetree from a treebank, yield a goodman
		reduction of eight rules per node.

		>>> tree = Tree("(S (NP mary) (VP walks))")
		>>> d = GoodmanDOP([tree])
		>>> utree = d.decorate_with_ids(tree, count())
		>>> list(d.goodman(tree, utree))
		[(S, (NP, VP)), (S, (NP, VP@2)), (S, (NP@1, VP)), (S, (NP@1, VP@2)), 
		(NP, ('mary',)), (NP, ('mary',)), (NP@1, ('mary',)), (NP@1, ('mary',)),
		(VP, ('walks',)), (VP, ('walks',)), (VP@2, ('walks',)), 
		(VP@2, ('walks',))]i   N(   t   zipt   productionsR1   t   rhsR   t   lhs(   R"   R(   R)   t   pt   upR9   t   lt   r(    (    s   /home/andreas/coglang/dopg.pyR   Z   s      !" c            sL   ‡  f d †  } g  } | D]+ \ } } | t  | | d | | | ƒ ƒq ~ S(   sÆ   merge cfg and frequency distribution into a pcfg with the right
		probabilities.

			@param cfg: a list of Productions
			@param nonterminalfd: a FreqDist of (non)terminals (with and
			without IDs)c            s6   t  d „  t ‡  f d †  | ƒ ƒ t ˆ  t |  ƒ ƒ S(   Nc         S   s   |  | S(    (    (   R   R   (    (    s   /home/andreas/coglang/dopg.pyR/      s    c            s(   d  t  |  ƒ j o ˆ  t  |  ƒ p d S(   R   i   (   R4   (   t   z(   R&   (    s   /home/andreas/coglang/dopg.pyR/      s   (   R2   t   mapt   floatR4   (   R=   R>   (   R&   (    s   /home/andreas/coglang/dopg.pyt   prob€   s    RB   (   R   (   R"   R%   R&   RB   t   _[1]R=   R>   (    (   R&   s   /home/andreas/coglang/dopg.pyR   x   s    	c            ss  ‡ ‡ f d †  } t  t ƒ ‰  t  t ƒ ‰ x.t d t ˆ ƒ d ƒ D]} x
t d t ˆ ƒ | ƒ D]ï ‰ ˆ | d ‰ x' ˆ i D] } | ˆ ˆ | ƒ ˆ  | <q‚ WxJ t ˆ i ƒ D]9 } ˆ i | } ˆ  | c | ˆ ˆ d | | f ƒ 7<q² Wt ‡  f d †  ˆ i Dƒ ƒ } t ‡ ‡ ‡ f d †  t ˆ ˆ ƒ Dƒ ƒ } ˆ  | ƒ | ˆ ˆ ˆ f <qd WqD Wˆ d t ˆ ƒ d f S(   s#   not working yet. almost verbatim translation of Goodman's (2003)
		most constituents correct parsing algorithm, except for python's
		zero-based indexing. needs to be modified to return the actual parse
		tree. expects a pcfg in the form of a dictionary from productions to
		probabilities c            sT   ‡  ‡ f d †  } ‡  ‡ f d †  } | |  | | ƒ | |  | | ƒ | d t  t ƒ S(   Nc            s.   ˆ  i  t t ˆ d |  !| g ˆ |  d ƒ S(   Ni   (   R   R   R$   (   t   st   tR   (   R"   t   sent(    s   /home/andreas/coglang/dopg.pyt   f’   s    c            s   ˆ  i  t | ˆ |  | d !ƒ S(   Ni   (   R   R   (   RD   RE   R   (   R"   RF   (    s   /home/andreas/coglang/dopg.pyt   e•   s    i   (   R6   R$   (   RD   RE   R   RG   RH   (   R"   RF   (    s   /home/andreas/coglang/dopg.pyt   g‘   s    i   i   s   %s@%dc         3   s   x |  ] } ˆ  | Vq Wd  S(   N(    (   R   R   (   t   sumx(    s   /home/andreas/coglang/dopg.pys	   <genexpr>¤   s   	 c         3   s5   x. |  ]' } ˆ  ˆ | f ˆ  | d  ˆ f Vq Wd S(   i   N(    (   R   R>   (   t   maxcRD   RE   (    s   /home/andreas/coglang/dopg.pys	   <genexpr>§   s   	(   R    t   intt   rangeR1   R   R   R   t   max(   R"   RF   RI   t   lengthR   t   kt   max_xt
   best_split(    (   RJ   RD   R"   RE   RK   RF   s   /home/andreas/coglang/dopg.pyt   parse‹   s(      
  *"(   t   __name__t
   __module__R*   R   R   R   R   RS   (    (    (    s   /home/andreas/coglang/dopg.pyR      s   %			c          C   s   d i  d ƒ }  t d „  |  Dƒ d d ƒ} | i GHd } xA | o9 d Gt ƒ  } x% | i i | i  ƒ  ƒ D] } | GHqi Wq< Wd S(	   s    a basic REPL for testing s   (S (NP John) (VP (V likes) (NP Mary)))
(S (NP Peter) (VP (V hates) (NP Susan)))
(S (NP Harry) (VP eats (NP pizza)))
(S (NP Hermione) (VP eats))s   
c         s   s   x |  ] } t  | ƒ Vq Wd  S(   N(   R   (   R   R
   (    (    s   /home/andreas/coglang/dopg.pys	   <genexpr>¹   s   	 R$   R   s   foo!s	   sentence:N(   R   R   R    t	   raw_inputR!   t   nbest_parse(   t   corpust   dt   wR(   (    (    s   /home/andreas/coglang/dopg.pyt   main®   s     	 t   __main__Nt   verboset   optionflagss   %d doctests succeeded!(    (   t   __doc__t   collectionsR    t	   itertoolsR   R   t   nltkR   R   R   R   R   R   R	   R   R   R[   RT   t   doctestt   testmodt   Falset   NORMALIZE_WHITESPACEt   ELLIPSISt   failt	   attempted(    (    (    s   /home/andreas/coglang/dopg.pyt   <module>   s   "	ž	