
H=Sc           @  s  d  Z  d d l m Z d d l Z d d l Z d d l Z d d l Z d d l Z d d l Z d d l	 Z	 d d l
 Z
 d d l Z d d l Z d d l m Z d d l m Z d d l m Z m Z d d l m Z m Z d d l m Z d d	 l m Z m Z m Z y d d l Z Wn e k
 r5d d l Z n Xd d l Z d d
 l  m! Z! m" Z" d d l  m# Z# m$ Z$ m% Z% d d l& m' Z' y( d d l( Z( d d l) j* j+ Z, e- Z. Wn e k
 re/ Z. n Xd d l0 m1 Z1 d d l2 m3 Z3 m4 Z4 d d l5 m6 Z6 d d l7 m8 Z8 m9 Z9 m: Z: m; Z; d Z< d Z= d Z> d Z? d Z@ d ZA d ZB d ZC e! eD  ZE e jF d  ZG e jF d  ZH e jF d  ZI e jF d  ZJ e jF d  ZK i d d 6d  d! 6d" d# 6ZL eM eN d$ jO     ZP eE jQ d%  eE jQ d&  eE jQ d'  eE jQ d(  eE jQ d)  eE jQ d*  d+         ZR d,   ZS e/ d-  ZT d.   ZU e/ d/  ZV d0   ZW e/ d1  ZX eE jQ d2  d3    ZY eE jQ d4  d5    ZZ eE jQ d6  d7    Z[ eE jQ d8  d9    Z\ eE jQ d:  d;    Z] d< d= e- d>  Z^ d< e/ d?  Z_ d@   Z` dA   Za dB   Zb dC   Zc dD   Zd dE   Ze dF e jf f dG     YZg e4 jh ji dH e- dI e/ dJ e/ dK e/ dL e/ dM e/ dN e/ dO e/ dP e/ dQ e/ dR e- dS e/ dT dU  i eT dV 6eU dW 6eV dX 6eW dY 6eX dZ 6Zj e
 jk   xS e
 jl   eE jm f D]< Zn en jo e
 jp  en jq d[ jr e
 js dT d\ d] d^   qWed   \ Zt Zu Zv Zw Zx Zy eD d_ k reE jz dI e- d` da  n  d S(b   s   Web interface to search a treebank. Requires Flask, tgrep2
or alpinocorpus-python (for xpath queries), style. Expects one or more
treebanks with .mrg or .dact extension in the directory corpus/i(   t   print_functionN(   t   nlargest(   t   quote(   t   datetimet	   timedelta(   t   islicet   groupby(   t
   itemgetter(   t   Countert   OrderedDictt   defaultdict(   t   Flaskt   Response(   t   requestt   render_templatet   send_from_directory(   t
   url_encode(   t   DrawTree(   t   treebankt	   fragments(   t   which(   t   TgrepSearchert   DactSearchert   RegexSearchert   filterlabelsi   i   i
   i   i  i   t   nls   corpus/sB   ([_/*A-Z0-9]+)(?:\[[^ ]*\][0-9]?)?((?:-[_A-Z0-9]+)?(?:\*[0-9]+)? )s   -[_A-Z0-9]+s    ([^ ()]+)(?=[ )])s   \(([^ ()]+) \)s!   \.(?:mrg(?:\.t2c\.gz)?|dact|txt)$s   .mrg.t2c.gzt   tgrep2s   .dactt   xpaths   .tokt   regexs9   black red orange blue green turquoise slategray peru tealt   /s   /countss   /treess   /sentss	   /bracketss
   /fragmentsc          C  s  d }  t j d k r* t j j d  }  n d t j k rI t j d }  n  t t j  } |  r |  t k rn d St j j d  r t t j |   St	 d d t j d t
 d	 | d |  d
 t |  t j  d d t k d d t k St	 d d t j d d d t
 d	 | d d t k d d t k S(   s    Main search form & results page.R   t   outputs   Invalid argumenti  t   exports   searchresults.htmlt   formt   textst   selectedtextst   resultst	   havexpathR   t	   havetgrepR   s   search.htmlt   countsN(   s   Invalid argumenti  (   t   NoneR   t   patht   lstript   argsR"   t   DISPATCHt   getR   R   t   TEXTSt   CORPORA(   R   t   selected(    (    s6   /home/andreas/ai/mscthesis/disco-dop/web/treesearch.pyt   mainH   s$    	c      
     s   f d   t  t j  D } | d k rq t  d t }  j d  d k rh t | d d } | Sd } n| d	 k r t  d t } d
 } n | d k rl j d  d k r | d k r d   n d   t  j d d  j	  d | d t
 d | d k }  j d  d k rFt t j | d t d d d d S   f d   | D } | d } n t d |   t | d d } d | | j d <| S(    s-   Export search results to a file for download.c           s-   i  |  ]# } | t  t | t   d   q S(   t   engine(   t
   CORPUS_DIRR-   t   EXT(   t   .0t   n(   R    (    s6   /home/andreas/ai/mscthesis/disco-dop/web/treesearch.pys
   <dictcomp>m   s   	 R&   t   doexportR   t   jsont   mimetypes   application/jsons
   counts.csvR   s   fragments.txtt   sentst   bracketst   treesR1   R   s   <!-- %s:%s -->
%s

s	   %s:%s|%s
R   t   queryt
   maxresultst   clst   indenti   c         3  sT   |  ]J }  j  d   r7   | d | d | d f n | d d j d  Vq d S(   t   linenosi    i   i   s   
s   utf-8N(   R,   t   encode(   R4   t   a(   t   fmtR    (    s6   /home/andreas/ai/mscthesis/disco-dop/web/treesearch.pys	   <genexpr>   s   s   .txts   cannot export %ss
   text/plains   attachment; filename=s   Content-Disposition(   s   sentss   bracketss   trees(   s   bracketss   trees(   R"   R   R*   R&   t   TrueR,   R   t   fragmentsinresultsR.   R9   t	   SENTLIMITR7   t   dumpst   JsonSetEncodert
   ValueErrort   headers(   R    R   R/   R#   t   respt   filename(    (   RC   R    s6   /home/andreas/ai/mscthesis/disco-dop/web/treesearch.pyR   j   s8    		!	
c          #  s   j  d d  }  f d   t   D } | s\ d t t d d    } d | Vn  | d k r t  j  d	 d
  j  d |  } n  t t  } t t  } g  | j	   D] } t
 | ^ q }	 t j d |	    t  d  }
 | s,d d j d   t t |
  d d g d  D  Vn  x3t t |
 j    d d< f g d  D]	\ } \ } \ } } t   } d } i  } | d0 k rt   j  d k rPn  | } d d d d j d   t |
  D  f } nq d } | rd } t  j  d	 d
  j | |  } n  j  d d  } t  j  d	 d
  j | | d t } | sod | | | pf| f Vn  xt | j    D]\ } } | d0 k rt | | j	    } n4 | | j |  | | j |  t | j	    } | | }  j  d  rt  j  d   n t | } t
 | } | | | <| d k rLt | } nW | d k ret | } n> | d k r~t | } n% | d k r| | pd } n t  d | | | | <| | 7} | sd  | j d!  d!  t t d" | d# d d | p d d	  j  d	 d
    | | | f } t  | | p6t |  } | rR| | d Vqdd$ | | f VqqW| s{| d0 k	 rt j! |    | <n  | sUd% d& j d!  t | j	    d t | j	    | f Vd' Vt" | j	    d k rqUq^ j  d  r.t# | t" | j	    d( | d) d* Vq^t# | t" | j	    d+ | | f d) d, VqUqUW| r j  d  d- k rt$ j%   j&   d. d/ Vqt' j(    }   j) |  | j*   VWd0 QXnd1   } d2 t |
  d/ Vd3     j+ D  t   d4 k rxt,    f d5     j D  } g    j+ D] } | d ^ q7  d6 <d7   j- d6  j.   j/ d8 |  Vn: t,   f d9     j D  } d7   j.   j/ d8 |  Vt# | t" | j	    d: | d) d, d; t0 Vd0 S(=   sL  Produce graphs and tables for a set of queries.

	Queries should be given one per line, optionally prefixed by a name and
	a normalization query::

		[name: ][normquery<tab>]query

	returns one graph for each query, and an overview with totals (optionally
	per category, if the first letters of each corpus name form a small set);
	t   normR9   c           s-   i  |  ]# } | t  t | t   d   q S(   R1   (   R2   R-   R3   (   R4   R5   (   R    (    s6   /home/andreas/ai/mscthesis/disco-dop/web/treesearch.pys
   <dictcomp>   s   	 s   counts?R   t   csvs6   Counts from queries (<a href="%s">export to CSV</a>):
R<   R1   R   t	   normqueryt   indexs   <ol>%s</ol>
s   
c         s  s%   |  ] \ } } d  | | f Vq d S(   s   <li><a href="#q%d">%s</a>N(    (   R4   R5   R<   (    (    s6   /home/andreas/ai/mscthesis/disco-dop/web/treesearch.pys	   <genexpr>   s   s   Combined resultst   Overviewi   t    i    s   %sLegend:	%si@   t    s   	c         s  s1   |  ]' \ } } d  t  j | d  | f Vq d S(   s   <font color=%s>%s</font>t   blackN(   t   COLORSR,   (   R4   R5   R<   (    (    s6   /home/andreas/ai/mscthesis/disco-dop/web/treesearch.pys	   <genexpr>   s   t   indicess%   <a name=q%d><h3>%s</h3></a>
<pre>
%s
t   limitt   constst   wordsg      Y@s8   %s (<a href="browsesents?%s">browse</a>)    %5d %5.2f %%i(   t   textt   sents(   <span style="color: gray; ">%s%s</span>
s"   %s%6d            %5.2f %%
</span>
t   TOTALs   </pre>s   Absolute counts of %s:t   unitt   matchess0   Relative frequency of %s: (count / num_%s * 100)t   %R7   R?   i   Nc         S  s   d t  |  d  S(   Ns   %gi   (   t   round(   t   x(    (    s6   /home/andreas/ai/mscthesis/disco-dop/web/treesearch.pyt   <lambda>  s    s.   <h3><a name=q%d>Overview of patterns</a></h3>
c         S  s   h  |  ] } | d   q S(   i    (    (   R4   t   key(    (    s6   /home/andreas/ai/mscthesis/disco-dop/web/treesearch.pys	   <setcomp>  s   	 i   c         3  si   |  ]_ }  D]R } d  | | f   | j  g    j D] } | d | k r4 | ^ q4 j   f Vq q d S(   s   %s_%si    N(   t   ixRP   t   mean(   R4   R<   t   letterRc   (   t   dft   firstletters(    s6   /home/andreas/ai/mscthesis/disco-dop/web/treesearch.pys	   <genexpr>  s   t   categorys   <pre>
%s
</pre>t   float_formatc         3  s%   |  ] } |   | j    f Vq d  S(   N(   Re   (   R4   R<   (   Rg   (    s6   /home/andreas/ai/mscthesis/disco-dop/web/treesearch.pys	   <genexpr>  s   s8   Relative frequencies of patterns: (count / num_%s * 100)t   dosort(   RR   N(1   R,   R"   R   t   dictR.   R&   R
   R   t   listt   valuesR-   t   pandast	   DataFramet	   querydictt   joint	   enumeratet	   iteritemsR'   t   lent   columnsRD   t   sortedt   itemst   sumt   updatet   appendt   intt   NUMSENTSt   NUMCONSTt   NUMWORDSRI   t   ljustt   dispplott   Seriest   maxt   barplotR7   RG   t   to_dictt   iot   BytesIOt   to_csvt   getvalueRP   R	   R   t   describet	   to_stringt   False(    R    R6   RM   R/   t   urlt   normresultst   combinedt	   combined1R5   RP   t   queriest   nameRO   R<   t   cntst   sumtotalt   relfreqR#   t   legendRL   RV   t   cntt   textnoRW   RZ   t   totalt   outt   plott   tmpRC   t   overviewRc   (    (   Rg   Rh   R    s6   /home/andreas/ai/mscthesis/disco-dop/web/treesearch.pyR&      s    	#	;	
$



	$c         #  s4  t  }   f d   t    D } d t t d d     } d t   d  d k  ra   d n   d d  d t | | d	 f Vxt t t t	   j
 d
 d  j   d | d t d d   k d d   k  t d    D]8\ } \ } } | | } t | } x
t |  D] \ }	 \ } }
 } } } |	 d k r`t } d | | d | d f Vn  d | |
 d   k r{d n d d   k rd n d | |
 |
 f } y+ t | | d | j d t d t  } Wn) t k
 r} d |
 | | | f } n Xd |
 | | f } | VqWd Vq W| r+d n d Vd S(   s6   Return visualization of parse trees in search results.c           s-   i  |  ]# } | t  t | t   d   q S(   R1   (   R2   R-   R3   (   R4   R5   (   R    (    s6   /home/andreas/ai/mscthesis/disco-dop/web/treesearch.pys
   <dictcomp>"  s   	 s   trees?R   RN   sv   <pre>Query: %s
Trees (showing up to %d per text; export: <a href="%s">plain</a>, <a href="%s">with line numbers</a>):
R<   i   s   ...s
   &linenos=1R1   R   R=   t   nomorpht   nofunci    sK   ==&gt; %s: [<a href="javascript: toggle('n%d'); ">toggle</a>]
<span id=n%d>i   sp   <a href="/browse?text=%d&sent=%s%s%s">browse</a>|<a href="/browsesents?text=%d&sent=%s&highlight=%s">context</a>s   &nofuncRR   s   &nomorpht	   highlightt   unicodelinest   htmls   #%s 
ERROR: %s
%s
%s
s   #%s [%s]
%s
s   </span>s   </pre>s   No matches.N(   R   R"   R   Rl   Ru   t	   TREELIMITRs   R   Rw   R.   R,   R;   R   R-   RD   R   RZ   RI   (   R    t
   gotresultsR/   R   R5   RL   R#   R   RZ   t   mt   sentnot   treeR[   t   hight   linkt   treereprt   errt   line(    (   R    s6   /home/andreas/ai/mscthesis/disco-dop/web/treesearch.pyR;     sD     1

(		c      
   #  sC  t  }   f d   t    D } d | r1 d n d t t d d     f } d t   d  d	 k  rs   d n   d d	  d
 t | | d f Vxt t t t	   j
 d d  j   d | d t d |  t d    D]G\ } \ } } | | } t | }	 xt |  D]\ }
 \ } } }  |
 d k rUt } d |	 | | f Vn  d | | d   k rpd n d d   k rd n d | | | f } | rt j | j d d   } | j  d   } n. d j  f d   t | j d   D  } d t |  j d  | | f VqWd Vq W| r:d  n d! Vd" S(#   s:   Return search results as terminals or in bracket notation.c           s-   i  |  ]# } | t  t | t   d   q S(   R1   (   R2   R-   R3   (   R4   R5   (   R    (    s6   /home/andreas/ai/mscthesis/disco-dop/web/treesearch.pys
   <dictcomp>N  s   	 s   %s?%sR;   R9   R   RN   sz   <pre>Query: %s
Sentences (showing up to %d per text; export: <a href="%s">plain</a>, <a href="%s">with line numbers</a>):
R<   i   s   ...s
   &linenos=1R1   R   R=   R:   i    sC   
%s: [<a href="javascript: toggle('n%d'); ">toggle</a>] <ol id=n%d>sn   <a href="/browse?text=%d&sent=%s%s%s">draw</a>|<a href="/browsesents?text=%d&sent=%s&highlight=%s">context</a>R   s   &nofuncRR   R   s   &nomorphs    )s    -NONE-)s   <span class=r>%s</span>RS   c         3  s1   |  ]' \ } } |   k r% d  | n | Vq d S(   s   <span class=r>%s</span>N(    (   R4   Ra   t   word(   R   (    s6   /home/andreas/ai/mscthesis/disco-dop/web/treesearch.pys	   <genexpr>n  s   s   <li>#%s [%s] %si   s   </ol>s   </pre>s   No matches.N(   R   R"   R   Rl   Ru   RF   Rs   R   Rw   R.   R,   R9   R   R-   RD   t   cgit   escapet   replaceRr   t   splitt   strt   rjust(   R    t
   dobracketsR   R/   R   R5   RL   R#   R   RZ   R   R   R[   R   R   (    (   R    R   s6   /home/andreas/ai/mscthesis/disco-dop/web/treesearch.pyR9   K  sB     "

%%	c         C  s   t  |  d t S(   s   Wrapper.R   (   R9   RD   (   R    (    (    s6   /home/andreas/ai/mscthesis/disco-dop/web/treesearch.pyR:   v  s    c      
   #  s,    j  d d  d k r! d Vd St }   f d   t    D } t   } | s d t t d d	     } d
 t   d  d k  r   d n   d d  d t t | f Vn    j  d d  d k } | r t	 j
 j d t d d  n t	 j
 j d t d d  x t t   j  d d  j   d | d t d t  D] \ } \ } } }	 } | d k rkt } n  | rt j t j |	 j d    \ }
 } d t |
  d j |  f } n |	 j d d  d } | j | j d   q>W| r| rd Vd St j d t  B } | j |  | j   t	 j | j g d d d  \ } } Wd QX| rt  t t! | |  d  d!   } n! t  t t! | |  d  d"   } t } | sd# Vn  xD| D]<\ }
 } t } | r|
 \ }
 } d j d$   | D  } n  | r0| rd% |
 | | f Vqd |
 | f Vq| rd& t" |
 j d   t" | j d   f } t# j$ d' t% j& d | d   } t% j& |
  d | }
 n4 d( t" |
 j d   } t# j$ d' t% j& |
   }
 t' j$ d) |
  }
 d* | | |
 f VqW| s(d+ V| rd, Vq(d- t( t) f Vn  d S(.   s0   Extract recurring fragments from search results.R1   R   R   s.   Only implemented for tgrep2 and xpath queries.Nc           s-   i  |  ]# } | t  t | t   d   q S(   R1   (   R2   R-   R3   (   R4   R5   (   R    (    s6   /home/andreas/ai/mscthesis/disco-dop/web/treesearch.pys
   <dictcomp>  s   	 s
   fragments?R   RN   s   <pre>Query: %s
Fragments (showing up to %d fragments in the first %d search results from selected texts;
ordered by (freq ** 0.5 * numwords ** 2) <a href="%s">Export</a>):
R<   i   s   ...R   t   discRC   t   discbrackett   bracketR=   R:   i    s   utf-8s   %s	%s
RS   s    )s    -NONE-)s   
t   utf8s   No matches.t   deletei   Rc   c         S  s,   t  d   |  d d D  d |  d d S(   Nc         s  s   |  ] } | r d  Vq d S(   i   N(    (   R4   RB   (    (    s6   /home/andreas/ai/mscthesis/disco-dop/web/treesearch.pys	   <genexpr>  s    i    i   i   g      ?(   Ry   (   t   ff(    (    s6   /home/andreas/ai/mscthesis/disco-dop/web/treesearch.pyRb     s    c         S  s4   t  d   t j d |  d  D  d |  d d S(   Nc         s  s   |  ] } d  Vq d S(   i   N(    (   R4   t   _(    (    s6   /home/andreas/ai/mscthesis/disco-dop/web/treesearch.pys	   <genexpr>  s    s   [^ ()]\)i    i   i   g      ?(   Ry   t   ret   finditer(   R   (    (    s6   /home/andreas/ai/mscthesis/disco-dop/web/treesearch.pyRb     s   	s   <ol>c         s  s   |  ] } | p d  Vq d S(   RR   N(    (   R4   RB   (    (    s6   /home/andreas/ai/mscthesis/disco-dop/web/treesearch.pys	   <genexpr>  s    s	   %s	%s	%s
s(   <a href='/draw?tree=%s&sent=%s'>draw</a>s    <font color=red>\1</font>s    <a href='/draw?tree=%s'>draw</a>s   (<font color=blue>\1</font> )s   <li>freq=%3d [%s] %ss   </ol>s   </pre>s)   No fragments with freq > %d & nodes > %d.(*   R,   R   R"   t   setR   Rl   Ru   t	   FRAGLIMITRF   R   t   PARAMSRz   RD   Rs   R.   R9   R   t
   alpinotreet   ElementTreet
   fromstringRA   R   Rr   R   t   addt   tempfilet   NamedTemporaryFilet
   writelinest   flusht   regularR   R'   R   t   zipR   t	   GETLEAVESt   subR   R   t   GETFRONTIERNTSt   MINNODESt   MINFREQ(   R    R6   R   R/   t   uniquetreesR   R   R5   R   t   treestrR   R[   R   R   R#   t   approxcountst   freqR   (    (   R    s6   /home/andreas/ai/mscthesis/disco-dop/web/treesearch.pyRE   {  s    	 (	!"
*$+		s   /stylec          C  s  d   }  d   } t  j j d  d k rO t |   d d } d | j d <n t  j j d  d	 k r t t j t d
 d d d } nK t t d d t  j d t	 d t
 t  j  d d d |    d d t k  } d | j d <t j   t d d  j d  | j d <| S(   s-   Show simple surface characteristics of texts.c          3  s   t  j  t j j t d   s& d Vn  d Vd t t  Vd }  x% t D] } t t | j	    }  PqG Wxr |  D]j     f d   t D } t
 | j    } | d k ro t | |   d d d	   k r d	 n d
 Vqo qo Wd S(   s   Generate plots from results.s   *.txts   No .txt files found in corpus/
Using sentences extracted from parse trees.
Supply text files with original formatting
to get meaningful paragraph information.

s0   <a href="style?export=csv">Export to CSV</a><br>s$   Results based on first %d sentences.c           s)   i  |  ] } t  | j   d   |  q S(   i    (   t
   STYLETABLER,   (   R4   RB   (   t   field(    s6   /home/andreas/ai/mscthesis/disco-dop/web/treesearch.pys
   <dictcomp>  s   	 i    t   :R]   R_   RR   N(    (   t   globt   osR(   Rr   R2   t   minR}   R   Rw   t   keysR   Rn   R   (   t   fieldsRB   t   dataR   (    (   R   s6   /home/andreas/ai/mscthesis/disco-dop/web/treesearch.pyt   generate  s    c            s   t  j   }  t r0 t t t t j      n g    t j |   } | j	 d g    | j
   f d   t t j    D  |  j   S(   s   Generate CSV file.RZ   c         3  s9   |  ]/ \ } } | g g    D] } | | ^ q Vq d  S(   N(    (   R4   R   t   rowRc   (   R   (    s6   /home/andreas/ai/mscthesis/disco-dop/web/treesearch.pys	   <genexpr>  s   (   R   R   R   Rw   t   nextt   iterRn   RN   t   writert   writerowt	   writerowsRx   R   (   R   R   (    (   R   s6   /home/andreas/ai/mscthesis/disco-dop/web/treesearch.pyt   generatecsv  s    *R   RN   R8   s
   text/plains   attachment; filename=style.csvs   Content-DispositionR7   R?   i   s   application/jsons   searchresults.htmlR    R!   R"   R   t   styleR#   R$   R   s   max-age=604800, publics   Cache-Controli   i    s   %a, %d %b %Y %H:%M:%S UTCt   Expires(   R   R*   R,   R   RJ   R7   RG   R   t   stream_templateR-   R"   R.   R   t   utcnowR   t   strftime(   R   R   RK   (    (    s6   /home/andreas/ai/mscthesis/disco-dop/web/treesearch.pyR     s     		
s   /drawc          C  s  d t  j k rw d t t  j d d t  j k r] g  t  j d j d  D] }  |  pT d ^ qE n d  j d t d t  St t  j d  t t  j d  } } d t  j k } d	 t  j k } t j	 j
 t t | d
  } t j	 j |  rCt t t |  | d |   j d  } t t | | |   j d t d t  } n d t k rt t | d } d | } t d j | j |  } t j t j |  d | rd n d d | rd n d \ }	 }
 t |	 |
  j d t d t  } n t d t |   d | | f S(   s5   Produce a visualization of a tree on a separate page.R   s   <pre>%s</pre>R[   RS   R   R   RZ   R   R   s   .mrgi   R   R   s   .dacts   %dt	   functionsR   t
   morphologyR   s   no treebank available for "%s".s   <pre id="t%s">%s</pre>N(   R   R*   R   R   R'   RZ   RD   R|   R   R(   Rr   R2   R-   t   existsR   R   t   opent   decodeR   R.   t   filest   readR   R   R   R   RI   (   RB   R   R   R   R   RL   R   t   resultt   sentidR   R[   (    (    s6   /home/andreas/ai/mscthesis/disco-dop/web/treesearch.pyt   draw  s0    J'
$s   /browsesentsc            s  d }  d t  j k rd t  j k rt t  j d  } t t  j d  d } t t  j j d d   d } t t |  d |  t | |  d  } t d | |  d  } t | |  t |  } t j j	 t
 t | d  } t j j |  r;g  t t j | d	 d
 | |  D] } d j	 t j |   ^ q} n d t k rt
 t | d } g  t | |  D]= }	 t j t d j | j d |	 d   j d  j ^ qi} n t d t |   g  t | |  D]: \ }	 } |	 | k rd t j |  n t j |  ^ q} d }
 } t  j j d d  rd t t d t  j d d t  j j d d    } t
 t | t  t  j j d d  } t! t  j d  } d d j	 d   t | d  D  }
 xt | j"    D]\ }	 \ } } t t  j d j# | d | f d d1 } x | D] \ } }   } | | k oC| k  n r  j% d    d j	   f d   | D  } | | | d j& | d t' j |	 d d  t j |  f  | | | d <q| | k rPqqWqWn  d } | |  k rd  | | |  d | f } n  d! } | t | |  k  rId" | | |  d | f } n  t( d# d$ | d% | d d t | d& t | d' | d( | d) | d* |  d+ | d d, | d- |
 d t  j j d d  d t  j j d d  Sd. d/ j	 d0   t t  D  S(2   sA   Browse through sentences in a file; optionally highlight matches.i   RZ   R[   i   R   i    i   s   .mrgt   encodingR   RS   R   s   .dacts   %8dt   sentences   no treebank available for "%s".s   <font color=red>%s</font>RR   R<   t   &R1   R   s
   Legend:	%ss   	c         s  s1   |  ]' \ } } d  t  j | d  | f Vq d S(   s   <font color=%s>%s</font>t   grayN(   RU   R,   (   R4   R5   R<   (    (    s6   /home/andreas/ai/mscthesis/disco-dop/web/treesearch.pys	   <genexpr>K  s   t   subsetR=   c         3  s   |  ] }   | Vq d  S(   N(    (   R4   RB   (   R[   (    s6   /home/andreas/ai/mscthesis/disco-dop/web/treesearch.pys	   <genexpr>S  s    s   <font color=%s>%s</font>R   s   <a id=prev>prev</a>s8   <a href="browsesents?text=%d&sent=%d%s" id=prev>prev</a>s   <a id=next>next</a>s8   <a href="browsesents?text=%d&sent=%d%s" id=next>next</a>s   browsesents.htmlR   R   t
   totalsentsR9   t   prevlinkt   nextlinkt   chunkt   mintreet   maxtreeR   s/   <h1>Browse through sentences</h1>
<ol>
%s</ol>
s   
c         s  s,   |  ]" \ } } d  | | t  | f Vq d S(   sF   <li><a href="browsesents?text=%d&sent=1&nomorph">%s</a> (%d sentences)N(   R}   (   R4   R5   RZ   (    (    s6   /home/andreas/ai/mscthesis/disco-dop/web/treesearch.pys	   <genexpr>i  s   N()   R   R*   R|   R,   R   R   R}   R   R(   Rr   R2   R-   R   R   R   R   R   t   findallR.   t   rangeR   R   R   R   t   findRZ   RI   Rs   R   R   R   Rl   R3   Rq   Rn   R9   R'   R   R   RU   R   (   R   R   R   R   t   startR   RL   RB   R#   R5   R   t   queryparamsR   R   R<   R^   R   R   t   matchR   R   (    (   R[   s6   /home/andreas/ai/mscthesis/disco-dop/web/treesearch.pyt   browsesents'  sz    (1SM
	%8	s   /browsec          C  s4  d }  d t  j k rd t  j k rt t  j d  } t t  j d  d } t d | | |   } t | |  t |  } d t  j k } d t  j k } t j j t	 t
 | d  } d	 t k rdt	 t
 | d
 } g  t | |  D]v } t t j t j t d	 j | j d | d   d | r+d) n d d | r=d) n d   j d t d t  ^ q }	 n t j j |  rg  t t |  | |  D]9 }
 t t |
 j d  | |   j d t d t  ^ q}	 n t d t
 |   g  t |	 |  D]8 \ } } d | d d t  j k r d n d | f ^ q} d t  j k rOd j |  Sd } | |  k r|d | | |  d f } n  d } | t | |  k  rd | | |  d f } n  t d d | d | d d t
 | d  t | d! | d" | d# | d$ |  d | d | d% | d d& | Sd' d j d(   t t
  D  S(*   s   Browse through trees in a file.i   RZ   R[   i   i    R   R   s   .mrgR   s   .dacts   %8dR   R   R   R   R   R   R   s   no treebank available for "%s".s   <pre id="t%s"%s>%s</pre>t   ajaxs    style="display: none; "RR   s   
s   <a id=prev>prev</a>s1   <a href="browse?text=%d&sent=%d" id=prev>prev</a>s   <a id=next>next</a>s1   <a href="browse?text=%d&sent=%d" id=next>next</a>s   browse.htmlR   R   R   R;   R   R   R   R   R   s+   <h1>Browse through trees</h1>
<ol>
%s</ol>
c         s  s,   |  ]" \ } } d  | | t  | f Vq d S(   sA   <li><a href="browse?text=%d&sent=1&nomorph">%s</a> (%d sentences)N(   R}   (   R4   R5   RZ   (    (    s6   /home/andreas/ai/mscthesis/disco-dop/web/treesearch.pys	   <genexpr>  s   N(   R   R*   R|   R   R   R}   R   R(   Rr   R2   R-   R.   R   R   R   R   R   R   R   R   R'   RZ   RD   R   R   R   R   R   RI   Rs   R   (   R   R   R   R  R   R   R   RL   R5   t
   drawntreesR   R   R#   R   R   (    (    s6   /home/andreas/ai/mscthesis/disco-dop/web/treesearch.pyt   browsen  sJ    XK	s   /favicon.icoc           C  s%   t  t j j t j d  d d d S(   s   Serve the favicon.t   statics   treesearch.icoR8   s   image/vnd.microsoft.icon(   R   R   R(   Rr   t   APPt	   root_path(    (    (    s6   /home/andreas/ai/mscthesis/disco-dop/web/treesearch.pyt   favicon  s    g      @RR   c         C  s  d d | g } d   |  D } i  } t  |  d k rZ | j t | t d d    n  | r{ t |  d |  j d t n |  }	 x} |	 D]u }
 | j d	 |  |
 r t | |  |
 |  n d
 |  |
 r | j |
 d
 d  n d
 t	 j
 |
  |  |
 | f  q W| j d  d j |  S(   s1   A HTML bar plot given a dictionary and max value.s   <div class=barplot>sB   <text style="font-family: sans-serif; font-size: 16px; ">%s</text>c         S  s   h  |  ] } | d   q S(   i    (    (   R4   Rc   (    (    s6   /home/andreas/ai/mscthesis/disco-dop/web/treesearch.pys	   <setcomp>  s   	 i   i   i   Rc   t   reversesC   <br><div style="width:%dpx;" class=b%d></div><span>%s: %g %s</span>i    s   </div>
s   
(   Ru   Rz   R   R   Rw   R,   RD   R{   R`   R   R   Rr   (   R   R   t   titlet   widthR]   Rk   R   Rh   t   colorR   Rc   (    (    s6   /home/andreas/ai/mscthesis/disco-dop/web/treesearch.pyR     s    
"'	%#"c         C  s  d | | f } xut  t |  t  r+ |  n |  g  D]O\ } } | sM q5 n  | r-g  } d } d g t |  d g }	 x t |	 |	 d |	 d  D] \ }
 } } | |
 d k r | | d k r | j d |  q | |
 d k r | } q | | d k r | j d | d | | d f  q q Wn' g  t |  D] } d | d ^ q:} | d | | t j | d	  d
 j |  f 7} q5 W| d S(   s@  Draw a dispersion plot from a list of indices.

	:param indices: a list of sets or Counter objects, where each element is
		a sentence number. Each element of indices will be drawn in a
		different color.
	:param total: the total number of sentences.
	:param runle: use a more compact, run-length encoded representation.s   	<svg version="1.1" xmlns="http://www.w3.org/2000/svg" width="%dpx" height="10px" >
<rect x=0 y=0 width="%dpx" height=10 fill=white stroke=black />
i    ii   i   s
   M %d 0v 10s   M %d 0l 0 10 %d 0 0 -10s:   <g transform="scale(%g, 1)">
<path stroke=%s d="%s" /></g>RT   RR   s   </svg>(	   Rs   t
   isinstanceRm   Rw   R   R{   RU   R,   Rr   (   RV   R   R  t   runleR   R5   RB   t   strokesR  t   seqt   prevt   idxt   nextidx(    (    s6   /home/andreas/ai/mscthesis/disco-dop/web/treesearch.pyR     s,    1* 	$'.c         C  s   t    } d |  k r x t d |  d j d d  j d   D]i } d | k r | j d  \ } } | j d   t t |  t |   D  q> | j t |   q> Wn | j t t	 t
    | S(   s7   Find available texts and parse selected texts argument.R!   t   .t   ,t   -c         s  s   |  ] } | Vq d  S(   N(    (   R4   R5   (    (    s6   /home/andreas/ai/mscthesis/disco-dop/web/treesearch.pys	   <genexpr>  s    N(   R   t   filterR'   R   R   Rz   R   R|   R   Ru   R-   (   R    R/   RB   t   bt   c(    (    s6   /home/andreas/ai/mscthesis/disco-dop/web/treesearch.pyR"     s    	//c         C  s   t    } x d   |  j   D D] } d | k rM | j d d  \ } } n, | d  t |  d k  rl d n d } | } d | k r | j d  \ } } n d | } } | | f | | <q  W| S(	   sU   Return an OrderedDict of names and queries.

	name is abbreviated query if not given.c         s  s!   |  ] } | j    r | Vq d  S(   N(   t   strip(   R4   Ra   (    (    s6   /home/andreas/ai/mscthesis/disco-dop/web/treesearch.pys	   <genexpr>  s    R   i   id   RR   s   ...s   	N(   R	   t
   splitlinesR   Ru   R'   (   R   R   R   R   R<   RO   (    (    s6   /home/andreas/ai/mscthesis/disco-dop/web/treesearch.pyRq     s    	&c   
      C  s  t  j d |   } y t d  } Wn t k
 r; d } n Xt j j | d  rV d S| r |  j d  r | d } t	 j
 d t d  d t d d d	 |  | g d
 t  } | j   d St j j | d  rt	 j
 d t d  d d | d d g d
 t d d d t	 j  } | j } n t j j | d  rNd   t | d  D } nx t j j | d  rd   t j | d  j   D } g  t | d t j D] } | | ^ q} n t d |    t  j d |   } t | d   }	 |	 j |  Wd QXd S(   s9   Create a tokenized copy of a text, one sentence per line.RR   t   uctos   .tokNs   .txtR*   s   -Ls   -ss   -nt   shells   .mrg.t2c.gzR   s   -ts   -ct   *t   bufsizeit   stdouts   .mrgc         s  s+   |  ]! } d  j  t j |   d Vq d S(   RS   s   
N(   Rr   R   R   (   R4   R   (    (    s6   /home/andreas/ai/mscthesis/disco-dop/web/treesearch.pys	   <genexpr>  s   s   .dactc         S  sG   i  |  ]= } t  j | j    j d   j j d  d | j    q S(   R   s   utf-8s   
(   R   R   t   contentsR  RZ   RA   R   (   R4   t   entry(    (    s6   /home/andreas/ai/mscthesis/disco-dop/web/treesearch.pys
   <dictcomp>  s   	Rc   s.   no file found for "%s" and ucto not installed.t   w(   t   EXTRER   R   RI   R'   R   R(   R   t   endswitht
   subprocesst   Popent   LANGR   t   waitt   PIPER$  R   t   alpinocorpust   CorpusReadert   entriesRw   R   t   numbaseR   (
   RL   t   baseR   t   newfilet   proct   tgrept	   convertedR   RB   R   (    (    s6   /home/andreas/ai/mscthesis/disco-dop/web/treesearch.pyt   tokenize  s>    


	/
c         C  s   y d d l  } Wn" t k
 r4 t j j d  i  SXt j t j j t	 d   } i  } t
 |   } xg t |  D]Y } t j j |  } d   | j t t j | d d |  d t j   D | | <qr W| S(	   s?   Get readability of all files and store results in a dictionary.iNso   readability module not found; install with:
pip install https://github.com/andreasvc/readability/tarball/masters   *.tokc         S  s2   i  |  ]( } | j    D] \ } } | |  q q S(    (   Rx   (   R4   R   Rc   t   value(    (    s6   /home/andreas/ai/mscthesis/disco-dop/web/treesearch.pys
   <dictcomp>9  s   	R   s   utf-8t   lang(   t   readabilityt   ImportErrorR
  t   loggert   warningR   R   R(   Rr   R2   R   Rw   t   basenamet   getmeasuresR   R   R   R,  Rn   (   t   numsentsR;  R   R#   t   cutoffRL   R   (    (    s6   /home/andreas/ai/mscthesis/disco-dop/web/treesearch.pyt   getreadabilitymeasures)  s     	c            s  g  }  i  } t  j j t d  } t  j j |  rr y( t j t |   \ }  } } } } Wqr t k
 rn qr Xn  t	 t
 j
 t  j j t d    } t	 t
 j
 t  j j t d    } t
 j
 t  j j t d   }	 x$ | p | p |	 D] }
 t |
  q Wt	 t
 j
 t  j j t d    } | rit |  t | j d d   k rit | d d t | d <n  | rt rt |  t | j d	 d   k rt | d
 d t | d	 <n  | rt |  t | j d d   k rt | d d t | d <n  | s| s| st d t   | ro| rot |  t |  k r`t d   t | |  D  sot d   n  d   t  j j |  rt  j |  j   n  d   | | | D } t |   | k st   f d   | | | D  r|| j d  rg  | D]- }
 |
 j d  r t t |
  j    ^ q } g  | D]0 }
 |
 j d  r:t |
  j   j d  ^ q:} g  | D]6 }
 |
 j d  rwt t j t |
  j     ^ qw} nv| j d	  rg  | d	 j  j!   D] } | j"   ^ q} g  g  } } x+| D] }
 t# j$ |
  } d } } xF | j%   D]8 } | | j&   j d  7} | | j&   j d  7} q1W| j' |  | j' |  t( |
  qWn | j d  r g  | D] }
 t t |
  j    ^ q} g  | D]% }
 d t |
  j   j d  ^ q} g  | D] }
 d ^ q} n t d   g  | p>| p>| D]( } t  j j) t  j j* |   d ^ q?}  t+ |  } n  t j, |  | | | | f t | d  d d |  | | | | | f S(    s.   Get list of files and number of lines in them.s   treesearchcorpus.pickles   *.mrgs   *.dacts   *.txts   *.tokR   s   static/tgrepmacros.txtt
   numthreadsR   s   static/xpathmacros.txtR   s   static/regexmacros.txts8   no files with extension .mrg, .dact, or .txt found in %sc         s  sA   |  ]7 \ } } | j  d  d  d | j  d  d  d k Vq d S(   R  i   i    N(   t   rsplit(   R4   RB   t   t(    (    s6   /home/andreas/ai/mscthesis/disco-dop/web/treesearch.pys	   <genexpr>b  s   sJ   expected either .mrg or .dact files, or corresponding .mrg and .dact filesi    c         S  s2   h  |  ]( } t  j j t  j j |   d   q S(   i    (   R   R(   t   splitextR?  (   R4   RL   (    (    s6   /home/andreas/ai/mscthesis/disco-dop/web/treesearch.pys	   <setcomp>i  s   	c         3  s'   |  ] } t  j |  j   k Vq d  S(   N(   R   t   statt   st_mtime(   R4   RB   (   t   picklemtime(    s6   /home/andreas/ai/mscthesis/disco-dop/web/treesearch.pys	   <genexpr>k  s   s   .mrgt   (s   <node s   word=i   RS   s   no texts found.t   wbt   protocoli(    (    (    (-   R   R(   Rr   R2   R   t   picklet   loadR   RI   Rw   R   R8  R   R,   R   t
   NUMTHREADSt   ALPINOCORPUSLIBR   R   t   AssertionErrorRu   t   allR   RH  RI  t   anyR)  t	   readlinesR   t   countR   R   R   Rn   t   sizeR/  R0  R1  R%  R{   t   printRG  R?  RC  t   dump(   R!   t   corporat
   picklefileRA  t   numconstt   numwordst
   styletablet   tfilest   afilest   txtfilesRL   t   tokfilest   currentfilest   corpusR   t   constRY   R&  RB   (    (   RJ  s6   /home/andreas/ai/mscthesis/disco-dop/web/treesearch.pyt	   getcorpusA  s    "$$$**
!7:C)
(/>c         K  s?   t  j |  t  j j |   } | j |  } | j d  | S(   s9   Pass an iterator to a template; from Flask documentation.i   (   R
  t   update_template_contextt	   jinja_envt   get_templatet   streamt   enable_buffering(   t   template_namet   contextt   templR   (    (    s6   /home/andreas/ai/mscthesis/disco-dop/web/treesearch.pyR     s
    RH   c           B  s   e  Z d  Z d   Z RS(   s(   Convert sets to lists for JSON encoding.c         C  s,   t  | t  r t |  St j j |  |  S(   s   Do conversion.(   R  R   Rm   R7   t   JSONEncodert   default(   t   selft   obj(    (    s6   /home/andreas/ai/mscthesis/disco-dop/web/treesearch.pyRp    s    
(   t   __name__t
   __module__t   __doc__Rp  (    (    (    s6   /home/andreas/ai/mscthesis/disco-dop/web/treesearch.pyRH     s   t   quiett   debugR   t   completet   covert	   quadratict
   complementt   adjacentt   twotermst   nofreqt   approxRV   RC   R   R&   R;   R9   R:   R   i    s   %(asctime)s %(message)st   datefmts   %Y-%m-%d %H:%M:%St   __main__t   hosts   0.0.0.0({   Ru  t
   __future__R    R   R   R   R   RN   R7   R   t   loggingR   R*  t   heapqR   t   urllibR   R   R   t	   itertoolsR   R   t   operatorR   t   collectionsR   R	   R
   t   cPickleRN  R<  Ro   t   flaskR   R   R   R   R   t   werkzeug.urlsR   R/  t   xml.etree.cElementTreet   etreet   cElementTreeR   RD   RQ  R   t   discodop.treedrawR   t   discodopR   R   t   discodop.parserR   t   discodop.treesearchR   R   R   R   R   R   R   R   RF   RP  R,  R2   Rs  R
  t   compilet
   MORPH_TAGSt	   FUNC_TAGSR   R   R(  R3   Rl   Rs   R   RU   t   routeR0   R   R&   R;   R9   R:   RE   R   R   R  R  R  R   R   R"   Rq   R8  RC  Rf  R   Ro  RH   R   Rz   R+   t   basicConfigt	   getLoggerR=  t   logt   setLevelt   DEBUGt   handlerst   setFormattert	   FormatterR-   R}   R~   R   R   R.   t   run(    (    (    s6   /home/andreas/ai/mscthesis/disco-dop/web/treesearch.pyt   <module>   s   

"	
'	$	,+	W6G5$			%		P			$

