package DOPParser; /* * Grammar.java * * Created on 24 november 2005, 16:50 * */ /** * * @author gideon */ import java.util.*; import java.io.*; import java.text.*; /** * a grammar has an HashMap of Terminals * and a HashMap of nonTerminals and their names as hashcode (which contain the production rules inside them * it has methods for parsing, generate_all, generate random string */ public class Grammar { protected HashMap Terminals = new HashMap(); protected HashMap nonTerminals = new HashMap(); protected HashMap> rulesIndexedByUnaryRHS = new HashMap>(); protected HashMap> rulesIndexedByBodyLeftNonT = new HashMap>(); //protected HashMap> rulesIndexedByRHSTwo = new HashMap>(); protected HashMap> myTreeBank = new HashMap>(); public static int nrBinaryBranches = 0, nrTertiaryBranches = 0, nrQuarteryBranches = 0, nrQuinteryBranches = 0, nrSexteryBranches = 0; public static ArrayList quinteryBranchSentences = new ArrayList (); protected static Random rnd = new Random(2096695); /** Creates a new instance of Grammar * from a file of annotated training samples *first it reads the annotated corpus sentence by sentence and creates parseTree objects from it *then it adds the parseTree to the grammar */ public Grammar(boolean blnDummy) throws Exception { //create TOP, and add to list NonTerminal myNonTerminal = new NonTerminal("TOP"); this.nonTerminals.put("TOP", myNonTerminal); BufferedReader buff = null; String mySentence; int treeIndex = 0; parseTree myParseTree = null; ArrayList completeSentences = new ArrayList (); ArrayList unLabeledSentences = new ArrayList (); ArrayList labeledSentences = new ArrayList (); HashMap labeledAndUnlabeledSentences = new HashMap (); ArrayList latexSentences = new ArrayList (); ArrayList spanSentences = new ArrayList (); //String=rule, Integer=rule count HashMap> rulesForPrinting = new HashMap>(); HashMap preterminalRulesForPrinting = new HashMap(); HashSet terminalsForPrinting = new HashSet(); //HashMap rulesForPrinting = new HashMap(); StringBuffer myCompleteSentence = new StringBuffer(); String previousCharacter = ""; //iterate over files of WSJ //"d:/wsj_combined/02 if (Main.BRACKET_FORMAT_WSJ_STYLE) { if (Main.READ_WSJ_TREEBANK_FROM_DIRECTORY) { for (String myFile : Utils.getListOfWSJFiles()) { buff = new BufferedReader(new FileReader(myFile)); System.out.println("Processing file " + myFile + "..." + "; completeSentences.size()=" + completeSentences.size()); while ((mySentence = buff.readLine()) !=null){ //collect sentence over multiple lines and remove spurious spaces if (mySentence.startsWith("(")) { //store previous sentence if (myCompleteSentence.toString() != null && !myCompleteSentence.toString().equals("")) { completeSentences.add(myCompleteSentence.toString().substring(1)); } //beginning of sentence myCompleteSentence = new StringBuffer(); myCompleteSentence.append("+"); previousCharacter = ""; } //go through sentence and remove spurious spaces for (int characterPosition =0; characterPosition < mySentence.length(); characterPosition++) { if (mySentence.substring(characterPosition, characterPosition+1).equals(" ") ) { //only append if not previous is also space if (!previousCharacter.equals(" ")) myCompleteSentence.append(mySentence.substring(characterPosition, characterPosition+1)); } else myCompleteSentence.append(mySentence.substring(characterPosition, characterPosition+1)); previousCharacter = mySentence.substring(characterPosition, characterPosition+1); } } //next line of buffer //write last sentence of buffer if (myCompleteSentence.toString() != null) { completeSentences.add(myCompleteSentence.toString().substring(1)); //without the + } myCompleteSentence = new StringBuffer(); previousCharacter = ""; buff.close(); } //list of files } //if READ_WSJ_DIRECTORY else { //read preprocessed single WSJ-file buff = new BufferedReader(new FileReader(Main.TREEBANK_FILE)); while ((mySentence = buff.readLine()) !=null){ //NOTE: this file has TOP already in it!! so cut first "(TOP " and last bracket off //completeSentences.add(mySentence.substring(5, mySentence.length()-1)); completeSentences.add(mySentence); } } int myCounter = 0, nrTerminals = 0;; System.out.println("At completion completeSentences.size()=" + completeSentences.size()); //extract parses from sentence for (String simpleSentence : completeSentences) { myCounter++; if (myCounter % 500 ==0) System.out.println("##############################" + myCounter + ": " +simpleSentence + "##############################"); //System.out.println(simpleSentence); //remove first and last spurious bracket from sentence //xxx check!!! helaas begint niet altijd met ( (, maar soms ook met ((!!! if (Main.READ_WSJ_TREEBANK_FROM_DIRECTORY) simpleSentence = simpleSentence.substring(1, simpleSentence.length()-1); //if (simpleSentence.startsWith(" ")) simpleSentence = simpleSentence.substring(1); simpleSentence = simpleSentence.trim(); myParseTree = ExtractParseFromWSJText(simpleSentence, false); //myParseTree.printToLatex(false, false, ""); //you still need to remove nonTerminal nodes that have no children myParseTree.removeEmptyNonTerminalNodes(); //you must do this recursively, because empty nodes may stack myParseTree.removeEmptyNonTerminalNodes(); myParseTree.removeEmptyNonTerminalNodes(); //filter out WSJ10 nrTerminals =0; for (Node myNode : myParseTree.getNodes()) { if (myNode.getType()==Main.TERMINAL) { //TERMINALS nrTerminals++; } } if (nrTerminals<=10) { boolean hasQuinteryBranches = false; if (Main.DO_BRANCH_COUNTING) { hasQuinteryBranches = myParseTree.doBranchCounting() ; } if(!hasQuinteryBranches) { /////////////////// PREPARE FOR PRINTING ///////////////// if (Main.PRINT_UNLABELED_FILE || Main.PRINT_LATEX_FILE || Main.PRINT_CLEANEDUP_LABELED_FILE || Main.PRINT_CNF_CONVERTED_LABELED_FILE || Main.PRINT_SPANS || Main.CALCULATE_LIKELIHOOD) { prepareParseTreeForPrinting(myParseTree, unLabeledSentences, labeledSentences, latexSentences, spanSentences); } treeIndex++; //System.out.println("line " + treeIndex + ": " + mySentence); if (Main.EXPORT_CFG_GRAMMAR_TO_FILE) prepareCFGGrammarRulesForPrinting(myParseTree, rulesForPrinting, preterminalRulesForPrinting, terminalsForPrinting); //preprocessParseTreeAndAddToGrammar method includes conversion to CNF else preprocessParseTreeAndAddToGrammar(myParseTree, treeIndex); //in CNF-format here if (Main.PRINT_CNF_CONVERTED_LABELED_FILE) { labeledSentences.add(myParseTree.printWSJFormat()); } if (Main.PRINT_BINARYSPANS) { System.out.println("Sentence #" + myCounter + ": " + simpleSentence); //spanSentences.add(myParseTree.printToLatex(false, true, "")); spanSentences.add(myParseTree.printSpans()); } } //if(!hasQuinteryBranches) } //printToLatex(boolean blnPrintNrSubtrees, boolean blnPrintNodeSpans } //for (String simpleSentence : completeSentences) } //if (Main.BRACKET_FORMAT_WSJ_STYLE) else { //OVIS //buff = new BufferedReader(new FileReader("WSJ_eenzin_labeled.txt")); buff = new BufferedReader(new FileReader(Main.TREEBANK_FILE)); treeIndex =0; while ((mySentence = buff.readLine()) !=null){ treeIndex++; //System.out.println("line " + treeIndex + ": " + mySentence); myParseTree = ExtractParseFromOVISText(mySentence); boolean hasQuinteryBranches = false; if (Main.DO_BRANCH_COUNTING) { hasQuinteryBranches = myParseTree.doBranchCounting() ; } if(!hasQuinteryBranches) { /////////////////// PREPARE FOR PRINTING ///////////////// if (Main.PRINT_UNLABELED_FILE || Main.PRINT_LATEX_FILE || Main.PRINT_CLEANEDUP_LABELED_FILE || Main.PRINT_CNF_CONVERTED_LABELED_FILE || Main.PRINT_SPANS || Main.CALCULATE_LIKELIHOOD) { prepareParseTreeForPrinting(myParseTree, unLabeledSentences, labeledSentences, latexSentences, spanSentences); } if (!Main.EXPORT_CFG_GRAMMAR_TO_FILE) preprocessParseTreeAndAddToGrammar(myParseTree, treeIndex); else prepareCFGGrammarRulesForPrinting(myParseTree, rulesForPrinting, preterminalRulesForPrinting, terminalsForPrinting); //in CNF-format here if (Main.PRINT_CNF_CONVERTED_LABELED_FILE) { labeledSentences.add(myParseTree.printOVISFormat(true)); } if (Main.PRINT_BINARYSPANS) { System.out.println("Sentence #" + treeIndex + ": " + mySentence); myParseTree.printToLatex(false, true, ""); spanSentences.add(myParseTree.printSpans()); } } //if(!hasQuinteryBranches) } buff.close(); } //System.out.println("Before calculateRuleProbabilities"); if (!Main.EXPORT_CFG_GRAMMAR_TO_FILE) calculateRuleProbabilities(); if (Main.CALCULATE_LIKELIHOOD) { calculateLikelihood(myTreeBank, this); } //if (Main.CALCULATE_LIKELIHOOD) /////////////// PRINTING //////////////// if (Main.EXPORT_CFG_GRAMMAR_TO_FILE) Printer.printCFGGrammar(rulesForPrinting, preterminalRulesForPrinting, terminalsForPrinting); if (Main.PRINT_UNLABELED_FILE) Printer.printSentences(unLabeledSentences, "unlabeled.txt"); //write latex text to file if (Main.PRINT_LATEX_FILE) Printer.printSentences(latexSentences, "WSJ_latex.txt"); //write labeled text to file if (Main.PRINT_CLEANEDUP_LABELED_FILE || Main.PRINT_CNF_CONVERTED_LABELED_FILE) Printer.printSentences(labeledSentences, "WSJ_labeled.txt"); if (Main.PRINT_SPANS) Printer.printSentences(spanSentences, "Spanfile.txt"); if (Main.DO_BRANCH_COUNTING) { System.out.println("nrBinaryBranches=" + nrBinaryBranches); System.out.println("nrTertiaryBranches=" + nrTertiaryBranches); System.out.println("nrQuarteryBranches=" + nrQuarteryBranches); System.out.println("nrQuinteryBranches=" + nrQuinteryBranches); System.out.println("nrSexteryBranches=" + nrSexteryBranches); //print parses with quintery branches Printer.printSentences(quinteryBranchSentences, "QuinteryBranchSentences.txt"); } } public static void calculateLikelihood(HashMap> bankOfParses, Grammar myGrammar) throws IOException { //System.out.println("In CALCULATE_LIKELIHOOD"); double totalLogLikelihood = 0d, logLikelihood=0d, sentenceProbability=0d; boolean ruleFound = false; TreeMap> sentencesSortedByLikelihood = new TreeMap>(); HashMap totalProbabilityForCertainLength = new HashMap(); HashMap totalSentencesForCertainLength = new HashMap(); NonTerminal myNodeNonTerminal=null, ChildNonTerminal1=null, ChildNonTerminal2=null; Constituent Child1; parseTree cnfParseTree = null; int sentenceLength; //for (String myLabeledSentence : labeledSentences) { //for (parseTree cnfParseTree : bankOfParses) { int testCounter = 0; //bankOfParses is HM with key=unlabeled sentence and value =parseTree for (String myUnlabeledSentence : bankOfParses.keySet()) { testCounter++; //if (testCounter%200==0) System.out.println("testCounter=" + testCounter + "; sentence=" + myUnlabeledSentence.split("#")[0]); System.out.println ("in calculate likelihood" + myUnlabeledSentence.split("#")[0]); //XXX it is CFG now!!! no! CNF //GenericPair cnfParseTree = bankOfParses.get(myUnlabeledSentence).getSecond(); System.out.println("parseTree=" + cnfParseTree.printWSJFormat()); logLikelihood = 0d; sentenceProbability = 1d; //loop over nodes, and multiply the probabilities; also remember total for (Node myNode : cnfParseTree.getNodes()) { //if (testCounter%200==0) System.out.println("Node=" + myNode.getName()); Rule tempRule; myNodeNonTerminal = myGrammar.nonTerminals.get(myNode.getName()); //you must reconstruct the rule (LHS and RHS) to find it in grammar so you can obtain rule probability if (myNode.getChildNodes().size()==2) { //find the children ChildNonTerminal1 = myGrammar.nonTerminals.get(myNode.getChildNodes().get(0).getName()); ChildNonTerminal2 = myGrammar.nonTerminals.get(myNode.getChildNodes().get(1).getName()); //this rule may not be unique //tempRule = new Rule(myNodeNonTerminal, rhsArrayofConstituents); tempRule = new Rule(myNodeNonTerminal, ChildNonTerminal1, ChildNonTerminal2 ); if (tempRule==null) System.out.println("tempRule==null"); //System.out.println("tempRule=" + myNode.getName() + "-> " + myNode.getChildNodes().get(0).getName() + " >< " + myNode.getChildNodes().get(1).getName() + "<<"); //System.out.println("tempRule=" + myNodeNonTerminal.getName() + "-> " + ChildNonTerminal1.getName() + " >< " + ChildNonTerminal2.getName() + "<<"); ruleFound = false; //enumerate all rules with LHS equal to current node for (Rule existingRule : myNodeNonTerminal.getRules()) { if (tempRule.equals(existingRule)) { //update loglikelihood //if (testCounter%200==0) System.out.println("found rule, prob= " + existingRule.getRuleProbability()); logLikelihood -= Math.log(existingRule.getRuleProbability())/Math.log(2.); //((double) existingRule.getCount()) * sentenceProbability *= existingRule.getRuleProbability(); System.out.println(myNodeNonTerminal.getName() + " --> " + ChildNonTerminal1.getName() + " " + ChildNonTerminal2.getName() + " : " + existingRule.getRuleProbability()); ruleFound = true; break; } } if (!ruleFound) System.out.println("Rule doesnt exist"); } //end if (myNode.getChildNodes().size()==2) //rules with RHS size=1 or TOP rules else { //logLikelihood of preterminals =0??? HELEMAAL NIET!!! //XXX volgens mij alleen TOP -->S met loglikelihood=0: of is S erafgehaald met CNF-conversie? if (myNode.getChildNodes().size()==1 && myNode.getType()==Main.NONTERMINAL) { //&& myNode.getName().equals("TOP") //TOP node or preterminal: in latter case child can be terminal if (myNode.getChildNodes().get(0).getType()==Main.NONTERMINAL) Child1 = myGrammar.nonTerminals.get(myNode.getChildNodes().get(0).getName()); else Child1 = myGrammar.Terminals.get(myNode.getChildNodes().get(0).getName()); tempRule = new Rule(myNodeNonTerminal, Child1); //System.out.println(myNodeNonTerminal.getName() + "-->" + Child1.getName()); ruleFound = false; for (Rule existingRule : myNodeNonTerminal.getRules()) { if (tempRule.equals(existingRule)) { //update loglikelihood logLikelihood -= Math.log(existingRule.getRuleProbability())/Math.log(2.); //((double) existingRule.getCount()) * sentenceProbability *= existingRule.getRuleProbability(); System.out.println(myNodeNonTerminal.getName() + " --> " + myNode.getChildNodes().get(0).getName() + " : " + existingRule.getRuleProbability()); //System.out.println("Parent = TOP; Child=" + myNode.getChildNodes().get(0).getName() + "; logLikeli=" + logLikelihood); ruleFound = true; break; } } if (!ruleFound) System.out.println("Rule doesnt exist"); } //if (size=1 && TOP } //else } //for (Node myNode : cnfParseTree.getNodes()) //store in array cnfParseTree.setProbability(logLikelihood); //cnfParseTree.setProbability(sentenceProbability); //keep TreeSet of 50 parseTrees with highest likelihood //if number of parseTrees in TreeSet >50, then remove lowest probability //mostUnlikelyParses.put(new Double(logLikelihood), myUnlabeledSentence.split("#")[0]); //if (mostUnlikelyParses.size()>50) mostUnlikelyParses.remove(mostUnlikelyParses.firstKey()); //update totalLogLikelihood totalLogLikelihood += logLikelihood; //store the total probability for trees of different lengths sentenceLength = myUnlabeledSentence.split(" ").length -1; //because unlabeled sentence includes " ." if (totalProbabilityForCertainLength.get(sentenceLength) ==null) { //xxx je moet generic class of pairs of objects maken!!! totalProbabilityForCertainLength.put(sentenceLength, logLikelihood); totalSentencesForCertainLength.put(sentenceLength, 1); } else { totalProbabilityForCertainLength.put(sentenceLength, totalProbabilityForCertainLength.get(sentenceLength) + logLikelihood); totalSentencesForCertainLength.put(sentenceLength, totalSentencesForCertainLength.get(sentenceLength)+1); } } //for (String myUnlabeledSentence : bankOfParses.keySet()) { System.out.println("!!!!!!!!!!!!!totalLogLikelihood= " + totalLogLikelihood + "; testCounter=" + testCounter); //now you know probabilities for all parses in the treebank //you must normalize likelihood by dividing by average likelihood of sentences of same length //sort it, and output a sorted file of unlabeled sentences with probabil after comma, and a sorted file of labeled sentences double normalizedLikelihood = 0d; double averageLikelihood = 0d; String labeledSentence = null; //System.out.println("# sentences=" + bankOfParses.size()); for (String myUnlabeledSentence : bankOfParses.keySet()) { sentenceLength = myUnlabeledSentence.split(" ").length-1; averageLikelihood = totalProbabilityForCertainLength.get(sentenceLength)/((double) totalSentencesForCertainLength.get(sentenceLength)); cnfParseTree = bankOfParses.get(myUnlabeledSentence).getSecond(); normalizedLikelihood = cnfParseTree.getProbability() -averageLikelihood; labeledSentence = bankOfParses.get(myUnlabeledSentence).getFirst(); //cnfParseTree.transformBackToCFG(); //if (Main.DO_WSJ_PARSE) labeledSentence = cnfParseTree.printWSJFormat(); //else labeledSentence = cnfParseTree.printOVISFormat(true); if (sentencesSortedByLikelihood.get(normalizedLikelihood)==null) { //create new HashSet for sentences with this probability ArrayList sentencesWithSameLikelihood = new ArrayList(); //.split("#")[0], because random number added at end, so it takes duplicates sentencesWithSameLikelihood.add(myUnlabeledSentence.split("#")[0] + "#" + labeledSentence); sentencesSortedByLikelihood.put(normalizedLikelihood, sentencesWithSameLikelihood); } else { //add pair to HashSet sentencesSortedByLikelihood.get(normalizedLikelihood).add(myUnlabeledSentence.split("#")[0] + "#" + labeledSentence); } } //print a sorted list of unlabeled sentences, and labeled sentences BufferedWriter out_unlabeled = new BufferedWriter(new FileWriter(Main.OUTPUT_DIR + "/" + "sorted_by_likelihood_unlabeled.csv")); BufferedWriter out_labeled = new BufferedWriter(new FileWriter(Main.OUTPUT_DIR + "/" + "sorted_by_likelihood_labeled.txt")); for (Double myLikelihood : sentencesSortedByLikelihood.keySet()) { for (String pairOfSentences : sentencesSortedByLikelihood.get(myLikelihood)) { sentenceLength = pairOfSentences.split("#")[0].split(" ").length-1; averageLikelihood = totalProbabilityForCertainLength.get(sentenceLength)/((double) totalSentencesForCertainLength.get(sentenceLength)); out_unlabeled.write(pairOfSentences.split("#")[0] + ", " + (myLikelihood + averageLikelihood) + ", " + myLikelihood); out_unlabeled.newLine(); out_labeled.write(pairOfSentences.split("#")[1] ); out_labeled.newLine(); } } out_labeled.flush(); out_labeled.close(); out_unlabeled.flush(); out_unlabeled.close(); //print the most likely parses //NumberFormat numberFormatter; //numberFormatter = NumberFormat.getNumberInstance(); //System.out.println("Total Likelihood=" + numberFormatter.format(totalLogLikelihood) + "; The 50 most unlikely sentences are: "); //XXX if there is more than one with exact same likelihood in TOP 50, it is missed, because only unique likelihoods //for (Double myLikelihood : mostUnlikelyParses.keySet()) { // System.out.println(mostUnlikelyParses.get(myLikelihood) + " : likelihood=" + numberFormatter.format(myLikelihood.doubleValue())); //} } public Grammar() { } /* reads in the complete grammar from a file containing the production rules * **/ public Grammar(String readGrammarFromFileName) throws Exception { BufferedReader buff, buffAssociations; String mySentence; //System.out.println("gezocht: " + readGrammarFromFileName); //buff = new BufferedReader(new FileReader("d:/StolckeGrammar_artificial.txt")); buff = new BufferedReader(new FileReader(readGrammarFromFileName)); HashMap labelAssociations = new HashMap(); if (Main.REPLACE_LABELS_IN_GRAMMAR) { //read the file of label associations buffAssociations = new BufferedReader(new FileReader("Input_for_evaluation/labelAssociations.txt")); while ((mySentence = buffAssociations.readLine()) !=null){ //associations are separated by comma's labelAssociations.put(mySentence.split(",")[0], mySentence.split(",")[1]); } } NonTerminal myLHSNonTerminal = null; NonTerminal myNonTerminalB = null; NonTerminal myNonTerminalC = null; String myLHS = null; String originalLHS = null; String myProb = null; Rule myRule = null; //skip first line //buff.readLine(); //first read terminals, create them and add to grammar while (!(mySentence = buff.readLine()).equals("NONTERMINALS")){ this.Terminals.put(mySentence, new Terminal(mySentence)); } //read nonterminals, create them (w/o rules) and add to grammar //skip first line //buff.readLine(); //String myNonT = null; while (!(mySentence = buff.readLine()).equals("PRODUCTION RULES")){ if (Main.REPLACE_LABELS_IN_GRAMMAR) { if (!(labelAssociations.get(mySentence)==null)) { mySentence = labelAssociations.get(mySentence); } //else myNonT = mySentence; } this.nonTerminals.put(mySentence, new NonTerminal(mySentence)); //System.out.println("NonTerminal=" + mySentence); } //read rules, add rules to nonterminals and add them to storeRulesAccordingToRHS //for probabilities, note E notation while ((mySentence = buff.readLine()) !=null){ if (mySentence.indexOf("RULESOFNONTERMINAL") >= 0) { originalLHS = mySentence.substring(19); if (Main.REPLACE_LABELS_IN_GRAMMAR) { if (!(labelAssociations.get(originalLHS)==null)) { originalLHS = labelAssociations.get(originalLHS); } } //System.out.println("myNonTerminalString=" + mySentence.substring(19)); //System.out.println("myNonTerminalFirst=" + myLHSNonTerminal.getName()); } else { String myRHS = mySentence.split("#")[0]; if (Main.REPLACE_LABELS_IN_GRAMMAR) { //replace all nonT in RHS by other labels //break up at * and put together again StringBuffer myRHSBuffer = new StringBuffer(); for (String myLabel : myRHS.split("\\*")) { if (!(labelAssociations.get(myLabel)==null)) { myRHSBuffer.append(labelAssociations.get(myLabel) + "*"); } else myRHSBuffer.append(myLabel + "*"); //if (myLabel.equals("CHNK~_NIET|_DE|_VINK~1063")) System.out.println("CHNK~_NIET|_DE|_VINK~1063 replaced by " + labelAssociations.get(myLabel)); } myRHS = myRHSBuffer.toString(); } //System.out.println("myRHS=" + myRHS); ArrayList completeRules = new ArrayList(); //also check length=2 en X --> Y w boolean blnNonCNF = false; //rules of length 2 cannot have terminals in their RHS if (this.Terminals.containsKey(myRHS.split("\\*")[0])) //if (myRHS.split("\\*").length == 2 && (myRHS.split("\\*")[0].equals(myRHS.split("\\*")[0].toLowerCase()) || myRHS.split("\\*")[1].equals(myRHS.split("\\*")[1].toLowerCase()))) blnNonCNF = true; if (myRHS.split("\\*").length == 2 && (this.Terminals.containsKey(myRHS.split("\\*")[0]) || this.Terminals.containsKey(myRHS.split("\\*")[1]))) blnNonCNF = true; if (myRHS.split("\\*").length > 2 || blnNonCNF) { //non-CNF rule //you need to split the rule into subrules and then loop over each of the subrules //while assigning/dividing correct probabilities //System.out.println("THIS RULE IS NOT IN CNF FORMAT!!! " + originalLHS + "-->" + myRHS); //create parseTree (Root node is automatically created from LHS parseTree myParseTree = new parseTree(originalLHS); Node topNode = myParseTree.getNodes().get(0); Node myChildNode; //create nodes with parents LHS for (String childName : myRHS.split("\\*")) { myChildNode = new Node(childName, topNode); myChildNode.setType(Main.NONTERMINAL); //XXX TEMP SOLUTION if (childName.equals(childName.toLowerCase())) if (this.Terminals.containsKey(childName)) myChildNode.setType(Main.TERMINAL); myParseTree.addNode(myChildNode); //add RHS nodes as children to parent topNode.addChildNode(myChildNode); } //convert rule to CNF format myParseTree.transformToCNF(); //create array of rules, one for each node of parseTree for (Node NodeCNF : myParseTree.getNodes()) { //only iff node has children if (NodeCNF.getChildNodes().size()>0) { //create nonTerminals for CNF-forms (but w/o rules) if (NodeCNF.getName().contains("X_")) { if (!this.nonTerminals.containsKey(NodeCNF.getName())){ //create new NonTerminal NonTerminal newNonTerminal = new NonTerminal(NodeCNF.getName()); this.nonTerminals.put(NodeCNF.getName(), newNonTerminal); } } //reconstruct rule, with correct RHS and prob StringBuffer recreatedRule = new StringBuffer(); //LHS recreatedRule.append(NodeCNF.getName()).append("#"); //children for (Node childNode : NodeCNF.getChildNodes()) { recreatedRule.append(childNode.getName()).append("*"); } //assign correct probabilities recreatedRule.append("#"); //all rules with X in LHS get probability 1 if (NodeCNF.getName().contains("X_")) recreatedRule.append("1."); else recreatedRule.append(mySentence.split("#")[1]); //add to completeRules completeRules.add(recreatedRule.toString()); //System.out.println("recreatedRule=" + recreatedRule.toString()); } } } else { //rewrite binary or unary rule such that it includes LHS completeRules.add(originalLHS + "#" + myRHS + "#" + mySentence.split("#")[1]); //System.out.println("originalLHS +# + mySentence=" + originalLHS + "#" + mySentence); } //now we got everything in CNF format (hopefully) for (String completeRule : completeRules) { myLHS = completeRule.split("#")[0]; myRHS = completeRule.split("#")[1]; //System.out.println("completeRule="+completeRule + "; myLHS=" + myLHS + "STOP"); //if (myRHS==null) System.out.println(completeRule); myProb = completeRule.split("#")[2]; myLHSNonTerminal = this.nonTerminals.get(myLHS); if (myLHSNonTerminal ==null) System.out.println("myLHSNonTerminal ==null"); //TEMP SOLUTION if (myRHS.equals("*")) continue; String leftBodyNonT = myRHS.split("\\*")[0]; String rightBodyNonT = null; //System.out.println("completeRule=" + completeRule + "; myLHS=" + myLHS + "; myRHS=" + myRHS + "; myProb=" + myProb + "; myLHSNonTerminal=" + myLHSNonTerminal.getName() ); if (myRHS.split("\\*").length == 2) { //binary rule rightBodyNonT = myRHS.split("\\*")[1]; //System.out.println("length=2: " + myRHS.split("\\*")[0]); //System.out.println(leftBodyNonT + "= " + this.nonTerminals.get(leftBodyNonT) + "; rightBodyNonT: " + rightBodyNonT + "= " + this.nonTerminals.get(rightBodyNonT)); myRule = new Rule(myLHSNonTerminal, this.nonTerminals.get(leftBodyNonT), this.nonTerminals.get(rightBodyNonT)); } else { //unary rules : check if RHS is terminal or nonterminal if(this.Terminals.get(leftBodyNonT)!=null) { //terminal //System.out.println("Terminal=" + myRHS.split("\\*")[0]); //System.out.println("idem=" + this.Terminals.get(myRHS.split("\\*")[0]).getName()); myRule = new Rule(myLHSNonTerminal, this.Terminals.get(leftBodyNonT)) ; } else { //System.out.println("NonTerminal=" + myRHS.split("\\*")[0]); //System.out.println("idem=" + this.nonTerminals.get(myRHS.split("\\*")[0]).getName()); myRule = new Rule(myLHSNonTerminal, this.nonTerminals.get(leftBodyNonT)) ; } } //add probs double ruleProb = 0d; if (myProb.contains("E")) { ruleProb = java.lang.Double.parseDouble(myProb.split("E")[0])*Math.pow(10d, java.lang.Double.parseDouble(myProb.split("E")[1])); } else ruleProb = java.lang.Double.parseDouble(myProb); myRule.setRuleProbability(ruleProb); //add rule to nonTerminal //System.out.println("myLHSNonTerminal=" + myLHSNonTerminal.getName()); myLHSNonTerminal.addRule(myRule); //add to rulesIndexedByBodyLeftNonT if (rightBodyNonT != null) { //you only want to store binary rules if (rulesIndexedByBodyLeftNonT.get(leftBodyNonT) == null) { //you need to create a new HashSet of Rules and add it to HashMap of rulesIndexedByUnaryRHS HashSet myRuleSet = new HashSet(); myRuleSet.add(myRule); this.rulesIndexedByBodyLeftNonT.put(leftBodyNonT, myRuleSet); } else { //System.out.println("adding Rule " + myLHS + " --> " + leftBodyNonT + " " + rightBodyNonT); for (Constituent myConstituent : myRule.getRightHandSide()) { //System.out.println(myConstituent.getName()); } //System.out.println("MyRHS=" + myRule.getRightHandSide() //get reference to HashSet of rules with the same leftBodyNonT //adding to HashSet invokes equals method of Rule rulesIndexedByBodyLeftNonT.get(leftBodyNonT).add(myRule); } } else { //unary RHS //add to rulesIndexedByUnaryRHS if (rulesIndexedByUnaryRHS.get(myRHS) == null) { //you need to create a new HashSet of Rules and add it to HashMap of rulesIndexedByUnaryRHS HashSet myRuleSet = new HashSet(); myRuleSet.add(myRule); this.rulesIndexedByUnaryRHS.put(myRHS, myRuleSet); } else { rulesIndexedByUnaryRHS.get(myRHS).add(myRule); } } } } } /* temp int matchingConstituents = doPARSEVAL(bankOfParses.get(0), myTreeBank.get(1)); System.out.println("matchingConstituents=" + matchingConstituents); System.out.println("nrConstituentsTree1=" + myTreeBank.get(0).getNodes().size()); System.out.println("nrConstituentsTree2=" + myTreeBank.get(1).getNodes().size()); */ } public void prepareParseTreeForPrinting(parseTree myParseTree, ArrayList unLabeledSentences, ArrayList labeledSentences, ArrayList latexSentences, ArrayList spanSentences) { //System.out.println( "in prepareParseTreeForPrinting"); //write it into stringarray StringBuffer unLabeledSentence = new StringBuffer(); // int nrTerminals = 0; for (Node myNode : myParseTree.getNodes()) { if (myNode.getType()==Main.TERMINAL) { //TERMINALS //nrTerminals++; unLabeledSentence.append(myNode.getName().split(" ")[0]).append(' '); } } unLabeledSentence.append("."); unLabeledSentences.add(unLabeledSentence.toString()); //still in CFG-format here if (!Main.PRINT_CNF_CONVERTED_LABELED_FILE) { if(Main.BRACKET_FORMAT_WSJ_STYLE) labeledSentences.add(myParseTree.printWSJFormat()); else labeledSentences.add(myParseTree.printOVISFormat(true)); } //labeledAndUnlabeledSentences.put(unLabeledSentence.toString(), myParseTree.printWSJFormat()); //remember parseTrees GenericPair myPair; if(Main.BRACKET_FORMAT_WSJ_STYLE) myPair = new GenericPair(myParseTree.printWSJFormat(), myParseTree); else myPair = new GenericPair(myParseTree.printOVISFormat(true), myParseTree); //myParseTree is still CFG, but if you keep reference to it, it will turn later into CNF myTreeBank.put(unLabeledSentence.toString() + "#" + Math.random(), myPair); if (Main.PRINT_LATEX_FILE || Main.PRINT_SPANS) { myParseTree.calculateNodeDepthAndLabelNodes(); myParseTree.calculateNodeSpans(); //before CNF-conversion if (!Main.PRINT_BINARYSPANS) { latexSentences.add(myParseTree.printToLatex(false, true, "")); spanSentences.add(myParseTree.printSpans()); } } ///////////////////////////////////////////////////////// //print the parseTree in Latex format (on screen) //myParseTree2.printToLatex(false, false); } public void preprocessParseTreeAndAddToGrammar(parseTree myParseTree, int treeIndex){ //temp for printing only: calculate depth of nodes and label them //myParseTree2.calculateNodeDepthAndLabelNodes(); //temp for printing latex only: calculate spans //myParseTree2.calculateNodeSpans(); //extract unlabeled sentence StringBuffer unLabeledSentence = new StringBuffer(); //CNF conversion //System.out.println("CNF conversion"); myParseTree.transformToCNF(); //calculate depth of nodes and label them myParseTree.calculateNodeDepthAndLabelNodes(); myParseTree.calculateNodeSpans(); //myParseTree.printToLatex(false, true, ""); if (Main.DOGOODMANDOP) { //count the subtrees, start on the deepest level, and go up //store nr of subtrees in every node myParseTree.calculateSubTrees_generalized(); //print the CNF conversed parseTree in Latex format (including nr subtrees) //myParseTree2.printToLatex(true, false); //add terminals, nonterminals and rules to grammar //NB if you haven't converted to CNF you will get error here addParseTreeToGrammar(myParseTree, treeIndex); } else addParseTreeToGrammarNoGoodman(myParseTree); //temp test //if (this.nonTerminals.containsKey("X_7963")) { // System.out.println("X_7963 is in list of nonT" ); // for (Rule myRule : this.nonTerminals.get("X_7963").getRules()) { // System.out.println("X_7963-->" + myRule.getRightHandSide()); // } //} //test convert back to CFG //myParseTree2.transformBackToCFG(); //System.out.println("The parse tree converted back to CFG:"); //myParseTree2.printToLatex(true, false); } public String[] Expand(String[] currentSentence, int branch){ String[] Constituents = new String[100]; //todo return Constituents; } public String[] GenerateRandomSentence(int maxDepth, int minLength, int maxLength) { ArrayList mySentence = new ArrayList(); int depth = 0; int nrOfTerminals = 0; int nrOfTrials = 0; while ((nrOfTerminals < minLength || nrOfTerminals > maxLength) && nrOfTrials < 50){ nrOfTrials++; //start with TOP mySentence.clear(); mySentence.add(this.nonTerminals.get("TOP") ); while(depth <= maxDepth && nrOfTerminals == 0){ depth++; mySentence = GetRandomSuccessor(mySentence); nrOfTerminals = CheckIfAllTerminals(mySentence); } } //check again if sentence contains only terminals (in case thrown out because exceeds depth if (CheckIfAllTerminals(mySentence)>0) { //convert terminals to strings ArrayList outputsentence = new ArrayList(); for (Constituent myConstituent : mySentence){ Terminal tempTerminal = (Terminal) myConstituent; outputsentence.add(tempTerminal.getName()); //outputsentence[k] = tempTerminal.getName(); //k++; } return outputsentence.toArray(new String[0]) ; } else return null; } /* if only Terminals then returns nr of Terminals * otherwise returns 0\ */ public int CheckIfAllTerminals(ArrayList inputsentence){ int nrOfTerminals = 0; for (Constituent myConstituent : inputsentence){ if(myConstituent instanceof NonTerminal) return 0; else nrOfTerminals++; } return nrOfTerminals ; } public ArrayList GetRandomSuccessor(ArrayList inputsentence){ ArrayList outputSentence = new ArrayList(); //iterate over constituents and find nonTerminals for (Constituent myConstituent : inputsentence){ if(myConstituent instanceof NonTerminal){ //find out how many branches/rules NonTerminal has NonTerminal tempConstituent = nonTerminals.get(myConstituent.getName()); //pick a random nr between 0 and total nr of rules of NonTerminal //??? klopt dit? //if(tempConstituent.countRules() == 0){ // System.out.println("Problem with: " + tempConstituent.getName()); //} assert(tempConstituent.countRules() > 0); int randomRule = rnd.nextInt(tempConstituent.countRules()); int iCheck = tempConstituent.countRules(); //get the rule //assert(randomRule < tempConstituent.countRules()); for (Constituent insertConstituent : tempConstituent.getRule(randomRule).getRightHandSide() ){ outputSentence.add(insertConstituent); } } else outputSentence.add(myConstituent); } return outputSentence; } public String[] GetListofTerminals(){ //String[] myTerminalArray = new String[100]; ArrayList myTerminalArrayList = new ArrayList(); //int i = 0; for(String key : this.Terminals.keySet()) { myTerminalArrayList.add(key); } return myTerminalArrayList.toArray(new String[0]); } public String[] GetListofNonTerminals(){ //String[] myNonTerminalArray = new String[100]; ArrayList myNonTerminalArrayList = new ArrayList(); //int i = 0; for(String key : this.nonTerminals.keySet()) { //also add count of subtrees myNonTerminalArrayList.add(key); // + " (" + this.nonTerminals.get(key).getnrSubtrees() + ") " } return myNonTerminalArrayList.toArray(new String[0]) ; } public ArrayList> GetListofRulesPlusProbabilities(String myLHS){ ArrayList> ArrayListofRules = new ArrayList>(); //??? er zijn misschien nonTerminals zonder rule: maar dat mag niet ArrayList myAssociatedRules = this.nonTerminals.get(myLHS).getRules(); for (Rule myRule : myAssociatedRules){ ArrayList RuleArray = new ArrayList(); ArrayListofRules.add(RuleArray); //populate list of rules RuleArray.add(myLHS); //add RHS for (Constituent myConstituent : myRule.getRightHandSide() ){ if (myConstituent==null) System.out.println("Nullpointer: LHS=" + myRule.getLeftHandSide().getName()); RuleArray.add(myConstituent.getName()); } //add probability NumberFormat numberFormatter; numberFormatter = NumberFormat.getNumberInstance(); String myProb = "(" + numberFormatter.format(myRule.getRuleProbability()) + ")"; //String myProb = "(" + myRule.getRuleProbability() + ")"; RuleArray.add(myProb); } return ArrayListofRules; } /** Extracts the rules from the annotated OVIS file * @param sentence one sentence of OVIS file */ protected static parseTree ExtractParseFromOVISText(String sentence){ //(a,[(tv,[(nee_,[])]),(vp,[(v,[(dank_,[])]),(per,[(u_,[])])])]). //create temporary sentence with node structure //TOP node is automatically created parseTree myParseTree = new parseTree(); int characterPosition = 0; int wordPosition = 0; //currentNode is the index of the node in the parseTree (ArrayList of nodes) //set currentNode to TOP Node currentNode = myParseTree.getNode(myParseTree.getNodes().size()-1); //System.out.println(sentence); //turn sentence into array of characters char[] mySentence = sentence.toCharArray(); StringBuffer mySymbol; while ( characterPosition < sentence.length() -1) { //without ending . // one of three possibilities: // if current character is (, then a new rule starts --> move till after [ // if current character is ], then the rule ends --> move till after ) // otherwise, if next character is comma, then stay within the same rule --> move behind comma // 1) start of new rule: if (mySentence[characterPosition] == '(' ) { characterPosition++; //pass the ( //System.out.println(mySentence[characterPosition]); //get name of symbol, or LHS of rule mySymbol = new StringBuffer(); while (mySentence[characterPosition] != ',') { mySymbol.append(mySentence[characterPosition]); //System.out.println(mySentence[characterPosition]); characterPosition++; } //add the symbol to rule of dominant node //NB at instantiation of myParseTree the first current rule with LHS TOP is automatically created //make new node out of mySymbol, which is daughter of currentNode //currentNode is pointer to parent node (xxx) Node myNode = new Node(new String(mySymbol), currentNode); myParseTree.addNode(myNode); //add this node as a child to parent node currentNode.addChildNode(myNode); //position is now on comma, move ahead till you get behind [ characterPosition++; //pass the comma //System.out.println(mySentence[characterPosition]); characterPosition++; //pass the left bracket of RHS [ //System.out.println(mySentence[characterPosition]); if (mySentence[characterPosition-3] == '_' ) { //it was a terminal node, move ahead till after ) myNode.setType(Main.TERMINAL); //add the word position info in leftSpan and rightSpan fields wordPosition++; myNode.setLeftSpan(wordPosition-1); myNode.setRightSpan(wordPosition); characterPosition++; //pass the ] //System.out.println(mySentence[characterPosition]); characterPosition++; //pass the ) //System.out.println(mySentence[characterPosition]); } else { //set type to nonTerminal myNode.setType(Main.NONTERMINAL); //make this node the currentNode node currentNode = myNode; } } //end of RHS of rule // else { if (mySentence[characterPosition] == ']') { //this is the end of all the sister nodes //position is now on ], move on till behind the ) characterPosition++; //pass the ] //System.out.println(mySentence[characterPosition]); characterPosition++; //pass the ) //System.out.println(mySentence[characterPosition]); //update currentNode to parent of current currentNode = currentNode.getParentNode(); } //stay with same node if(mySentence[characterPosition] == ',') { characterPosition++; //pass the comma //System.out.println(mySentence[characterPosition]); } } /*test for (Node myNode : myParseTree.getNodes()) { System.out.println(myNode.getName() + " --> " + myNode.getChildNodes()); } */ return myParseTree; } /** Extracts the rules from the annotated OVIS file * @param sentence one sentence of OVIS file */ protected static parseTree ExtractParseFromWSJText(String sentence, boolean blnPrintChars){ //System.out.println("in ExtractParseFromWSJText; blnPrintChars=" + blnPrintChars + "; sentence=>>" + sentence + "<<"); //(a,[(tv,[(nee_,[])]),(vp,[(v,[(dank_,[])]),(per,[(u_,[])])])]). //(S (NP (POSTAG hallo))) //( (S (NP-SBJ (NNP Ms.) (NNP Waleson) ) (VP (VBZ is) (NP (NP (DT a) (JJ free-lance) (NN writer) ) (VP (VBN based) (NP (-NONE- *) ) (PP-LOC-CLR (IN in) (NP (NNP New) (NNP York) ))))) (. .) )) //create temporary sentence with node structure //TOP node is automatically created parseTree myParseTree = new parseTree(); int characterPosition = 0; int wordPosition = 0; //replace the first occurrence of S by TOP if ((Main.DO_LABEL_ASSOCIATION || Main.DIRECT_EVALUATION_NO_PARSING) && sentence.startsWith("(top (s ")) sentence = "(top " + sentence.substring(8, sentence.length()-2); //System.out.println(sentence); //currentNode is the index of the node in the parseTree (ArrayList of nodes) //set currentNode to TOP, which is created in constructor of parseTree Node currentNode = myParseTree.getNode(myParseTree.getNodes().size()-1); //XXX maar er zit ook TOP in de sentence!!! //turn sentence into array of characters char[] mySentence = sentence.toCharArray(); StringBuffer mySymbol; while ( characterPosition < sentence.length()) { //(np,[(capelle,[(capelle_,[])]),(aan,[(aan_,[])]),(den,[(den_,[])]),(ijssel,[(ijssel_,[])])]). //(a lorillard spokewoman (said)) ( (this) (is (an old story))) //(TOP (S (NP-SBJ (dt a) (nnp lorillard) (nn spokewoman)) (VP (vbd said) (S (NP-SBJ (dt this)) (VP (vbz is) (NP-PRD (dt an) (jj old) (nn story))))))) // one of three possibilities: // if current character is (, then a new Node starts --> move till after ( // if current character is ), then the rule ends --> move till after ) // otherwise, if next character is space, then stay within the same rule --> move behind space // // 1) start of new rule: if (mySentence[characterPosition] == '(' ) { if (blnPrintChars) System.out.println(mySentence[characterPosition]); characterPosition++; //pass the ( //get name of symbol, or LHS of rule mySymbol = new StringBuffer(); // (s,[(per,[(ik_,[])]),(vp,[(v,[(wil_,[])]),(mp,[(p,[(naar_,[])]),(np,[(np,[(den,[(den_,[])]),(haag,[(haag_,[])])]),(np,[(np,[(centraal_,[])]),(n,[(station_,[])])])])])])]). while (mySentence[characterPosition] != ' ') { mySymbol.append(mySentence[characterPosition]); if (blnPrintChars) System.out.println(mySentence[characterPosition]); characterPosition++; } //only if not again TOP String mySymbolString = mySymbol.toString(); //System.out.println("mySymbolString=" + mySymbolString); if (!mySymbolString.toUpperCase().equals("TOP") ) { //add the symbol to rule of dominant node //NB at instantiation of myParseTree the first current rule with LHS TOP is automatically created //take off the -SBJ etc //if (Main.TAKE_OFF_SBJ && mySymbolString.contains("-")) mySymbolString = mySymbolString.split("-")[0]; //make new node out of mySymbol, which is daughter of currentNode //currentNode is pointer to parent node (xxx) Node myNode = new Node(mySymbolString, currentNode); myParseTree.addNode(myNode); //add this node as a child to parent node currentNode.addChildNode(myNode); //position is now on space, move ahead till you get behind space if (blnPrintChars) System.out.println(mySentence[characterPosition]); characterPosition++; //pass the space //last word is a Terminal unless current character = ( if (mySentence[characterPosition] != '(' ) { //it was a terminal node, move ahead till after ) myNode.setType(Main.TERMINAL); //add the word position info in leftSpan and rightSpan fields wordPosition++; myNode.setLeftSpan(wordPosition-1); myNode.setRightSpan(wordPosition); //find the name of the terminal and replace the label of the node //continue till ) mySymbol = new StringBuffer(); while (mySentence[characterPosition] != ')') { mySymbol.append(mySentence[characterPosition]); if (blnPrintChars) System.out.println(mySentence[characterPosition]); characterPosition++; } //replace label, unless it is one of (, . : `` '' -LRB- or -RRB-), in which case you delete it!!! if (myNode.getName().equals(",") || myNode.getName().equals(".") || myNode.getName().equals(":") || myNode.getName().equals("``") || myNode.getName().equals("''") || myNode.getName().equals("-LRB-") || myNode.getName().equals("-RRB-") || myNode.getName().equals("$") || myNode.getName().equals("#") || myNode.getName().equals("-NONE-") ) { //remove reference in parent Node currentNode.getChildNodes().remove(myNode); //delete node (cross fingers that it works) myParseTree.getNodes().remove(myNode); wordPosition--; } else if (!Main.EXTRACT_POSTAGS) { //if you don't use POSTAGS then replace POSTAGS by lexical items //myNode.replaceLHS(mySymbol.toString()); myNode.replaceLHS((myNode.getName() + " " + mySymbol.toString()).toLowerCase()); } if (blnPrintChars) System.out.println(mySentence[characterPosition]); characterPosition++; //pass the ) } else { //set type to nonTerminal myNode.setType(Main.NONTERMINAL); //make this node the currentNode node currentNode = myNode; } } //it was not TOP again else { if (blnPrintChars) System.out.println(mySentence[characterPosition]); characterPosition++; //it was TOP, pass the space } } //end of RHS of rule if (mySentence[characterPosition] == ')') { //this is the end of all the sister nodes //position is now on ), move on till behind the ) if (blnPrintChars) System.out.println(mySentence[characterPosition]); characterPosition++; //pass the ) //update currentNode to parent of current currentNode = currentNode.getParentNode(); } //stay with same node if (characterPosition< sentence.length()) { if(mySentence[characterPosition] == ' ') { if (blnPrintChars) System.out.println(mySentence[characterPosition]); characterPosition++; //pass the space } } } //test //System.out.println(mySentence); /* for (Node myNode : myParseTree.getNodes()) { ArrayList childnodes = new ArrayList(); for (Node childNode : myNode.getChildNodes()) { childnodes.add(childNode.getName()); } //System.out.println(myNode.getName() + " --> " + myNode.getChildNodes()); System.out.println(myNode.getName() + " --> " + childnodes); } */ return myParseTree; } //[.{TOP} [.{S} [.{NP-SBJ} {Ms.} {Waleson} ] [.{VP} {is} [.{NP} [.{NP} {a} {free-lance} {writer} ] [.{VP} {based} [.{NP} ] [.{PP-LOC-CLR} {in} [.{NP} {New} {York} ] ] ] ] ] ] ] //[.{TOP} [.{S} [.{NP-SBJ} {Ms.} {Waleson} ] [.{VP} {is} [.{NP} [.{NP} {a} {free-lance} {writer} ] [.{VP} {based} [.{NP} ] [.{PP-LOC-CLR} {in} [.{NP} {New} {York} ] ] ] ] ] ] ] /* * enumerate the nodes of the parseTree, start with deepest level * add Terminals and NonTerminals to grammar, and create 8 rules for each NonTerminal node //add them to grammar with appropriate probabilities */ public void addParseTreeToGrammar(parseTree myParseTree, int treeIndex){ NonTerminal exteriorNonTerminal, interiorNonTerminal; Terminal myTerminal; //start with deepest level and climb up the tree for (int currentDepth = myParseTree.deepestLevel; currentDepth >=0; currentDepth--) { for (Node myNode : myParseTree.getNodes()) { //LOOP over all nodes, including terminal nodes if (myNode.getDepth() == currentDepth) { String myNodeName = myNode.getName(); //System.out.println(""); //System.out.println("XXXXXX depth=" + currentDepth + ", node=" + myNodeName); //first part: create Terminals or nonTerminals for the grammar if (myNode.getType()==Main.TERMINAL) { //TERMINALS //all terminals are converted to lower case myNodeName = myNodeName.toLowerCase(); //if Terminal is not yet in Arraylist of grammar then add it if (!this.Terminals.containsKey(myNodeName)) { //turn node into terminal and add to grammar myTerminal = new Terminal(myNodeName); this.Terminals.put(myNodeName, myTerminal); } else myTerminal = this.Terminals.get(myNodeName); } else { //NONTERMINALS //create an EXTERIOR NonTerminal, but first check if it already exists if (!this.nonTerminals.containsKey(myNodeName)){ //create exterior NonTerminal exteriorNonTerminal = new NonTerminal(myNodeName); //add nr of subtrees exteriorNonTerminal.setnrSubtrees(myNode.getnrSubtrees()); //add it to HashMap of nonTerminals of the grammar this.nonTerminals.put(myNodeName, exteriorNonTerminal); } else { //return reference exteriorNonTerminal = this.nonTerminals.get(myNodeName); //compute nr of subtrees for external nonTerminals exteriorNonTerminal.setnrSubtrees(exteriorNonTerminal.getnrSubtrees() + myNode.getnrSubtrees()); } //create an INTERIOR (labeled) nonTerminal, but not for TOP //if (!(myNodeName.equals("TOP"))){ String interiorNodeName = myNodeName + "@" + myNode.getLabel() + "@" + treeIndex; interiorNonTerminal = new NonTerminal(interiorNodeName); //add nr of subtrees interiorNonTerminal.setnrSubtrees(myNode.getnrSubtrees()); //add it to HashMap of nonTerminals of the grammar this.nonTerminals.put(interiorNodeName, interiorNonTerminal); //} //PART 2: add rules - only if current node is NONTERMINAL //ArrayList rhsArrayofConstituents = new ArrayList(); Rule tempRule; //if current node is preterminal then no labels for children if (myNode.getChildNodes().size()==1 && !(myNode.getName().equals("TOP"))) { //PRETERMINAL OR TOP!! //add two rules: one for exterior and one for interior //??? what are probabilities for these??? String daughterTerminalName = myNode.getChildNodes().get(0).getName().toLowerCase(); Terminal daughterTerminal = this.Terminals.get(daughterTerminalName); //add one rule to interior nonTerminal: this is unique rule interiorNonTerminal.addRule(new Rule(interiorNonTerminal, daughterTerminal)); //for exterior nonTerminal, check if rule already exists Rule tempRule1 = new Rule(exteriorNonTerminal, daughterTerminal); boolean bExists = false; for (Rule existingRule : exteriorNonTerminal.getRules()) { if (tempRule1.equals(existingRule)) { bExists = true; } } if (!bExists ){ exteriorNonTerminal.addRule(tempRule1); } } else { //TWO NONTERMINAL CHILDREN or TOP if (myNode.getChildNodes().size()==2) { //find the 4 children: 2 exterior and 2 interior nonTerminals (nodes themselves are unlabeled, only nonterminals are labeled) String myChildNodeName = myNode.getChildNodes().get(0).getName() + "@" + myNode.getChildNodes().get(0).getLabel() + "@" + treeIndex; NonTerminal interiorChild1 = this.nonTerminals.get(myChildNodeName); //System.out.println("interiorChild1=" + myChildNodeName); //System.out.println(interiorChild1.getName()); myChildNodeName = myNode.getChildNodes().get(0).getName(); NonTerminal exteriorChild1 = this.nonTerminals.get(myChildNodeName); myChildNodeName = myNode.getChildNodes().get(1).getName() + "@" + myNode.getChildNodes().get(1).getLabel() + "@" + treeIndex; NonTerminal interiorChild2 = this.nonTerminals.get(myChildNodeName); myChildNodeName = myNode.getChildNodes().get(1).getName(); //System.out.println("interiorChild2=" + myChildNodeName); //System.out.println(interiorChild2.getName()); NonTerminal exteriorChild2 = this.nonTerminals.get(myChildNodeName); //create 4 RightHandSides for the rules, //each of them are attached to both interior and exterior nonTerminal (LHS) //rhsArrayofConstituents.clear(); //rhsArrayofConstituents.add(interiorChild1); //rhsArrayofConstituents.add(interiorChild2); //two unique rules //myNodeNonTerminal.addRule(new Rule(myNodeNonTerminal, rhsArrayofConstituents)); //interiorNonTerminal.addRule(new Rule(interiorNonTerminal, rhsArrayofConstituents)); exteriorNonTerminal.addRule(new Rule(exteriorNonTerminal, interiorChild1, interiorChild2 )); interiorNonTerminal.addRule(new Rule(interiorNonTerminal, interiorChild1, interiorChild2 )); //System.out.println("PRINT RULES"); //System.out.println(myNodeNonTerminal.getName() + " --> " + interiorChild1.getName() + " " + interiorChild2.getName()); //System.out.println(interiorNonTerminal.getName() + " --> " + interiorChild1.getName() + " " + interiorChild2.getName()); //rhsArrayofConstituents.clear(); //rhsArrayofConstituents.add(interiorChild1); //rhsArrayofConstituents.add(ChildNonTerminal2); //two unique rules //myNodeNonTerminal.addRule(new Rule(myNodeNonTerminal, rhsArrayofConstituents)); //interiorNonTerminal.addRule(new Rule(interiorNonTerminal, rhsArrayofConstituents)); exteriorNonTerminal.addRule(new Rule(exteriorNonTerminal, interiorChild1, exteriorChild2 )); interiorNonTerminal.addRule(new Rule(interiorNonTerminal, interiorChild1, exteriorChild2 )); //System.out.println(myNodeNonTerminal.getName() + " --> " + interiorChild1.getName() + " " + ChildNonTerminal2.getName()); //System.out.println(interiorNonTerminal.getName() + " --> " + interiorChild1.getName() + " " + ChildNonTerminal2.getName()); //rhsArrayofConstituents.clear(); //rhsArrayofConstituents.add(ChildNonTerminal1); //rhsArrayofConstituents.add(interiorChild2); //two unique rules //myNodeNonTerminal.addRule(new Rule(myNodeNonTerminal, rhsArrayofConstituents)); //interiorNonTerminal.addRule(new Rule(interiorNonTerminal, rhsArrayofConstituents)); exteriorNonTerminal.addRule(new Rule(exteriorNonTerminal, exteriorChild1, interiorChild2 )); interiorNonTerminal.addRule(new Rule(interiorNonTerminal, exteriorChild1, interiorChild2 )); //System.out.println(myNodeNonTerminal.getName() + " --> " + ChildNonTerminal1.getName() + " " + interiorChild2.getName()); //System.out.println(interiorNonTerminal.getName() + " --> " + ChildNonTerminal1.getName() + " " + interiorChild2.getName()); //rhsArrayofConstituents.clear(); //rhsArrayofConstituents.add(ChildNonTerminal1); //rhsArrayofConstituents.add(ChildNonTerminal2); //unique rule //interiorNonTerminal.addRule(new Rule(interiorNonTerminal, rhsArrayofConstituents)); interiorNonTerminal.addRule(new Rule(interiorNonTerminal, exteriorChild1, exteriorChild2 )); //System.out.println(interiorNonTerminal.getName() + " --> " + ChildNonTerminal1.getName() + " " + ChildNonTerminal1.getName()); //this rule may not be unique //tempRule = new Rule(myNodeNonTerminal, rhsArrayofConstituents); tempRule = new Rule(exteriorNonTerminal, exteriorChild1, exteriorChild2 ); //System.out.println("RPPPP=" + myNodeNonTerminal.getName() + "-->" + ChildNonTerminal1.getName() + " " + ChildNonTerminal2.getName()); boolean bExists = false; for (Rule existingRule : exteriorNonTerminal.getRules()) { if (tempRule.equals(existingRule)) { bExists = true; } } if (!bExists ) exteriorNonTerminal.addRule(tempRule); } else { //TOP node String myChildNodeName = myNode.getChildNodes().get(0).getName() + "@" + myNode.getChildNodes().get(0).getLabel() + "@" + treeIndex; NonTerminal interiorChild1 = this.nonTerminals.get(myChildNodeName); myChildNodeName = myNode.getChildNodes().get(0).getName(); NonTerminal exteriorChild1 = this.nonTerminals.get(myChildNodeName); //create 2 RightHandSides for the rules, //each of them are attached to both interior and exterior nonTerminal (LHS) //two unique rules exteriorNonTerminal.addRule(new Rule(exteriorNonTerminal, interiorChild1)); interiorNonTerminal.addRule(new Rule(interiorNonTerminal, interiorChild1)); //rhsArrayofConstituents.clear(); //rhsArrayofConstituents.add(ChildNonTerminal1); //two unique rules interiorNonTerminal.addRule(new Rule(interiorNonTerminal, exteriorChild1)); tempRule = new Rule(exteriorNonTerminal, exteriorChild1); boolean bExists = false; for (Rule existingRule : exteriorNonTerminal.getRules()) { if (tempRule.equals(existingRule)) { bExists = true; } } if (!bExists ) exteriorNonTerminal.addRule(tempRule); } } } //NONTERMINALS, part 2 : add rules } } } } /* * enumerate the nodes of the parseTree, start with deepest level * add Terminals and NonTerminals to grammar, and create 8 rules for each NonTerminal node //add them to grammar with appropriate probabilities */ public void addParseTreeToGrammarNoGoodman(parseTree myParseTree){ NonTerminal exteriorNonTerminal; Terminal myTerminal; //start with deepest level and climb up the tree for (int currentDepth = myParseTree.deepestLevel; currentDepth >=0; currentDepth--) { for (Node myNode : myParseTree.getNodes()) { //LOOP over all nodes, including terminal nodes if (myNode.getDepth() == currentDepth) { String myNodeName = myNode.getName(); //first part: create Terminals or nonTerminals for the grammar if (myNode.getType()==Main.TERMINAL) { //TERMINALS //all terminals are converted to lower case myNodeName = myNodeName.toLowerCase(); //System.out.println("Node is terminal"); //if Terminal is not yet in Arraylist of grammar then add it if (!this.Terminals.containsKey(myNodeName)) { //turn node into terminal and add to grammar myTerminal = new Terminal(myNodeName); this.Terminals.put(myNodeName, myTerminal); } else myTerminal = this.Terminals.get(myNodeName); } else { //NONTERMINALS //create an EXTERIOR NonTerminal, but first check if it already exists if (!this.nonTerminals.containsKey(myNodeName)){ //create exterior NonTerminal exteriorNonTerminal = new NonTerminal(myNodeName); //add it to HashMap of nonTerminals of the grammar this.nonTerminals.put(myNodeName, exteriorNonTerminal); } else { //return reference exteriorNonTerminal = this.nonTerminals.get(myNodeName); } //PART 2: add rules - only if current node is NONTERMINAL //ArrayList rhsArrayofConstituents = new ArrayList(); Rule tempRule; //if current node is preterminal then no labels for children if (myNode.getChildNodes().size()==1 && !(myNode.getName().equals("TOP"))) { //PRETERMINAL OR TOP!! String daughterTerminalName = myNode.getChildNodes().get(0).getName().toLowerCase(); Terminal daughterTerminal = this.Terminals.get(daughterTerminalName); //System.out.println("daughterTerminalName=" + daughterTerminalName + "daughterTerminal=" + daughterTerminal.getName()); //for exterior nonTerminal, check if rule already exists Rule tempRule1 = new Rule(exteriorNonTerminal, daughterTerminal); boolean bExists = false; for (Rule existingRule : exteriorNonTerminal.getRules()) { if (tempRule1.equals(existingRule)) { //update rulecount existingRule.increaseCount(); bExists = true; } } if (!bExists ){ exteriorNonTerminal.addRule(tempRule1); } } else { //TWO or more NONTERMINAL CHILDREN or TOP if (myNode.getChildNodes().size()==2) { //find the children String myChildNodeName = myNode.getChildNodes().get(0).getName(); NonTerminal exteriorChild1 = this.nonTerminals.get(myChildNodeName); myChildNodeName = myNode.getChildNodes().get(1).getName(); NonTerminal exteriorChild2 = this.nonTerminals.get(myChildNodeName); //this rule may not be unique //tempRule = new Rule(myNodeNonTerminal, rhsArrayofConstituents); tempRule = new Rule(exteriorNonTerminal, exteriorChild1, exteriorChild2 ); if (tempRule==null) System.out.println("tempRule==null"); //System.out.println("RPPPP=" + myNodeNonTerminal.getName() + "-->" + ChildNonTerminal1.getName() + " " + ChildNonTerminal2.getName()); //System.out.println("myNodeNonTerminal=" + myNodeNonTerminal.getName()); //System.out.println("ChildNonTerminal1=" + ChildNonTerminal1.getName()); //System.out.println("ChildNonTerminal2=" + ChildNonTerminal2.getName()); boolean bExists = false; for (Rule existingRule : exteriorNonTerminal.getRules()) { if (tempRule.equals(existingRule)) { //update rulecount existingRule.increaseCount(); bExists = true; } } if (!bExists ) exteriorNonTerminal.addRule(tempRule); } else { //TOP node String myChildNodeName = myNode.getChildNodes().get(0).getName(); NonTerminal exteriorChild1 = this.nonTerminals.get(myChildNodeName); //System.out.println("myChildNodeName=" + myChildNodeName + "exteriorChild1=" + exteriorChild1.getName()); tempRule = new Rule(exteriorNonTerminal, exteriorChild1); boolean bExists = false; for (Rule existingRule : exteriorNonTerminal.getRules()) { if (tempRule.equals(existingRule)) { //update rulecount existingRule.increaseCount(); bExists = true; } } if (!bExists ) exteriorNonTerminal.addRule(tempRule); } } } //NONTERMINALS, part 2 : add rules } } } } public void prepareCFGGrammarRulesForPrinting(parseTree myParseTree, HashMap> rulesForPrinting, HashMap preterminalRulesForPrinting, HashSet terminalsForPrinting) { String myNodeName=null; StringBuffer myRuleString; String preterminalRule = null; for (Node myNode : myParseTree.getNodes()) { //LOOP over all nodes, including terminal nodes //WITHOUT SEMANTICS myNodeName = myNode.getName().split("-")[0].split("=")[0]; //XXX WITH SEMANTICS // myNodeName = myNode.getName(); //first part: create Terminals or nonTerminals for the grammar if (myNode.getType()==Main.NONTERMINAL) { //ignore terminal nodes //create StringBuffer myRuleString = new StringBuffer(); //add LHS, remove semantic annotation after - myRuleString.append(myNodeName).append(" "); //add RHS, find daughters for (Node daughterNode : myNode.getChildNodes()) { //if terminal then make it lowerCase if (!Main.GRAMMAR_PRINTING_PRETERMINALS && daughterNode.getType()==Main.TERMINAL) { myRuleString.append(daughterNode.getName().toLowerCase().split("-")[0].split("=")[0]).append(" "); } else //XXX TEMP WITH SEMANTICS //myRuleString.append(daughterNode.getName()).append(" "); myRuleString.append(daughterNode.getName().split("-")[0].split("=")[0]).append(" "); } //create entry for nonT if (rulesForPrinting.get(myNodeName)==null) { //create empty HM HashMap tempHM = new HashMap(); rulesForPrinting.put(myNodeName, tempHM); } //get the HP for this LHS nonT HashMap rulesOfNonT = rulesForPrinting.get(myNodeName); //identify this rule in HashMap, if exists increase rule count by one //if not put into HashMap with count =1 if (rulesOfNonT.get(myRuleString.toString())==null) { rulesOfNonT.put(myRuleString.toString(), new Integer(1)); } else { //already existed: increase count by one rulesOfNonT.put(myRuleString.toString(), new Integer(rulesOfNonT.get(myRuleString.toString()).intValue()+1)); } } else { //myNode.getType()==Main.TERMINAL if (Main.GRAMMAR_PRINTING_PRETERMINALS) { //create preTerminal node preterminalRule = myNodeName + " " + myNodeName.toLowerCase() + " "; if (preterminalRulesForPrinting.get(preterminalRule)==null) { preterminalRulesForPrinting.put(preterminalRule, new Integer(1)); } else { //already existed: increase count by one preterminalRulesForPrinting.put(preterminalRule, new Integer(preterminalRulesForPrinting.get(preterminalRule).intValue()+1)); } } else { terminalsForPrinting.add(myNode.getName()); } } } } public void calculateRuleProbabilities() { for(NonTerminal nt : this.nonTerminals.values()){ int totFrequency = 0; for (Rule myRule : nt.getRules()) { totFrequency += myRule.getCount(); } for (Rule myRule : nt.getRules()) { if (Main.DOGOODMANDOP) myRule.computeRuleProbability(); else myRule.setRuleProbability(((double) myRule.getCount())/((double) totFrequency)); } } } public void indexRulesAccordingToRHS() throws IOException { //Set of all rules //HashSet allRules = getAllRules(); //HashSet allRules = new HashSet(); String myRHSOne; for(NonTerminal nt : nonTerminals.values()){ //allRules.addAll(); for(Rule myRule : nt.getRules()) { //for every RHS there is a HashSet of rules //this HashSet together with the RHS are put into HashMap rulesIndexedByUnaryRHS myRHSOne = myRule.getRightHandSide().get(0).getName(); if (myRule.getRightHandSide().size() ==2) { //only reason to store it is for binary rules in CYK induction //XXX TEMP SOLUTION check for BUG //if (myRule.getRightHandSide().get(0)!=null && myRule.getRightHandSide().get(1)!=null) { if (rulesIndexedByBodyLeftNonT.get(myRHSOne)== null) { //create new HashSet; HashSet tempHashSet = new HashSet(); tempHashSet.add(myRule); this.rulesIndexedByBodyLeftNonT.put(myRHSOne, tempHashSet); } else{ HashSet tempHashSet = this.rulesIndexedByBodyLeftNonT.get(myRHSOne); tempHashSet.add(myRule); } //} //else { //System.out.println("BUG for rule " + myRule.getRightHandSide().get(0)) //} } else { if (rulesIndexedByUnaryRHS.get((myRHSOne + "*"))== null) { //create new HashSet; HashSet tempHashSet = new HashSet(); tempHashSet.add(myRule); this.rulesIndexedByUnaryRHS.put(myRHSOne + "*", tempHashSet); } else{ HashSet tempHashSet = this.rulesIndexedByUnaryRHS.get(myRHSOne + "*"); tempHashSet.add(myRule); } } } } } public HashSet getAllRules(){ //Set of all rules HashSet allRules = new HashSet(); for(NonTerminal nt : nonTerminals.values()){ allRules.addAll(nt.getRules()); } return allRules; } public HashSet getRulesFromRHS(String myRHS){ return this.rulesIndexedByUnaryRHS.get(myRHS); } public Object clone() { Grammar result = new Grammar(); result.Terminals = (HashMap) this.Terminals.clone(); result.nonTerminals = (HashMap) this.nonTerminals.clone(); //result.rulesIndexedByUnaryRHS = (HashMap>) this.rulesIndexedByUnaryRHS.clone(); return result; } }