Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Collinizer #1344

Merged
merged 4 commits into from
Feb 26, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 14 additions & 0 deletions src/edu/stanford/nlp/parser/lexparser/AbstractCollinizer.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
package edu.stanford.nlp.parser.lexparser;

import edu.stanford.nlp.trees.Tree;

/**
* Interface for the Collinizers: the tree transformers used to prepare
* parser output and gold trees for evaluation.
*<br>
* A collinizer receives the gold tree alongside the tree being transformed,
* so an implementation can base its decisions on the gold tree rather than
* on the guess. For example, NegraPennCollinizer decides which punctuation
* preterminals to drop by looking at the gold tags, so that retagging does
* not change the results.
*<br>
* TODO: some callers (e.g. FactoredParser) still pass the same tree for both
* arguments; they should pass in both the guess and the gold.
*
* @author John Bauer
*/
public interface AbstractCollinizer {
/**
* Transforms {@code guess} for evaluation, with {@code gold} available as
* a reference. To transform the gold tree itself, callers pass it as both
* arguments.
*
* @param guess the tree to transform
* @param gold the gold tree for the same sentence
* @return the transformed tree. May be {@code null}: NegraPennCollinizer
*         returns {@code null} when either argument is {@code null} or when
*         the two trees' yields differ in size; other implementations may
*         differ.
*/
Tree transformTree(Tree guess, Tree gold);
}
Original file line number Diff line number Diff line change
Expand Up @@ -336,7 +336,7 @@ public double[] MLEDependencyGrammarSmoothingParams() {
* tree. Should strip punctuation and maybe do some other things.
*/
@Override
public abstract TreeTransformer collinizer();
public abstract AbstractCollinizer collinizer();

/**
* the tree transformer used to produce trees for evaluation. Will
Expand All @@ -346,7 +346,7 @@ public double[] MLEDependencyGrammarSmoothingParams() {
* off. (finish this doc!)
*/
@Override
public abstract TreeTransformer collinizerEvalb();
public abstract AbstractCollinizer collinizerEvalb();


/**
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -207,15 +207,15 @@ public TreeTransformer subcategoryStripper() {
* The collinizer eliminates punctuation
*/
@Override
public TreeTransformer collinizer() {
public AbstractCollinizer collinizer() {
return new TreeCollinizer(tlp, !collinizerRetainsPunctuation, false);
}

/**
* Stand-in EVALB collinizer: simply reuses the collinizer returned by collinizer().
*/
@Override
public TreeTransformer collinizerEvalb() {
public AbstractCollinizer collinizerEvalb() {
return collinizer();
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -282,7 +282,7 @@ public static void main(String[] args) throws IOException {
FileFilter testFilt = new NumberRangesFileFilter(testArgs[1], false);
testTreebank.loadPath(new File(testArgs[0]), testFilt);
TreeTransformer subcategoryStripper = op.tlpParams.subcategoryStripper();
TreeTransformer collinizer = ctpp.collinizer();
AbstractCollinizer collinizer = ctpp.collinizer();

WordCatEquivalenceClasser eqclass = new WordCatEquivalenceClasser();
WordCatEqualityChecker eqcheck = new WordCatEqualityChecker();
Expand Down Expand Up @@ -371,8 +371,8 @@ public static void main(String[] args) throws IOException {
System.out.println("\nScores:");
basicEval.displayLast();

Tree collinsTree = collinizer.transformTree(tree);
Tree collinsGold = collinizer.transformTree(gold);
Tree collinsTree = collinizer.transformTree(tree, gold);
Tree collinsGold = collinizer.transformTree(gold, gold);
ourBrackets = proc.allBrackets(collinsTree);
goldBrackets = proc.allBrackets(collinsGold);
if (goodPOS) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -178,15 +178,15 @@ public MemoryTreebank memoryTreebank() {
* Returns a ChineseCollinizer
*/
@Override
public TreeTransformer collinizer() {
public AbstractCollinizer collinizer() {
return new ChineseCollinizer(ctlp);
}

/**
* Returns a ChineseCollinizer that doesn't delete punctuation
*/
@Override
public TreeTransformer collinizerEvalb() {
public AbstractCollinizer collinizerEvalb() {
return new ChineseCollinizer(ctlp, false);
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -221,12 +221,12 @@ public MemoryTreebank testMemoryTreebank() {
* be applied both to the parser output and the gold tree.
*/
@Override
public TreeTransformer collinizer() {
public AbstractCollinizer collinizer() {
return new TreeCollinizer(tlp, true, englishTrain.splitBaseNP == 2, englishTrain.collapseWhCategories);
}

@Override
public TreeTransformer collinizerEvalb() {
public AbstractCollinizer collinizerEvalb() {
return new TreeCollinizer(tlp, true, englishTrain.splitBaseNP == 2, englishTrain.collapseWhCategories);
}

Expand Down
24 changes: 12 additions & 12 deletions src/edu/stanford/nlp/parser/lexparser/FactoredParser.java
Original file line number Diff line number Diff line change
Expand Up @@ -461,8 +461,8 @@ public static void main(String[] args) {
depDE.evaluate(tree3, binaryTree, pw);
depTE.evaluate(tree3db, tree, pw);
}
TreeTransformer tc = op.tlpParams.collinizer();
TreeTransformer tcEvalb = op.tlpParams.collinizerEvalb();
AbstractCollinizer tc = op.tlpParams.collinizer();
AbstractCollinizer tcEvalb = op.tlpParams.collinizerEvalb();
if (op.doPCFG) {
// System.out.println("XXXX Best PCFG was: ");
// tree2.pennPrint();
Expand All @@ -471,8 +471,8 @@ public static void main(String[] args) {
//System.out.println("True Best Parse:");
//tree.pennPrint();
//tc.transformTree(tree).pennPrint();
pcfgPE.evaluate(tc.transformTree(tree2), tc.transformTree(tree), pw);
pcfgCB.evaluate(tc.transformTree(tree2), tc.transformTree(tree), pw);
pcfgPE.evaluate(tc.transformTree(tree2, tree2), tc.transformTree(tree, tree), pw);
pcfgCB.evaluate(tc.transformTree(tree2, tree2), tc.transformTree(tree, tree), pw);
Tree tree4b = null;
if (op.doDep) {
comboDE.evaluate((bothPassed ? tree4 : tree3), binaryTree, pw);
Expand All @@ -483,15 +483,15 @@ public static void main(String[] args) {
tree4 = np.prune(tree4);
}
//tree4.pennPrint();
comboPE.evaluate(tc.transformTree(tree4), tc.transformTree(tree), pw);
comboPE.evaluate(tc.transformTree(tree4, tree4), tc.transformTree(tree, tree), pw);
}
//pcfgTE.evaluate(tree2, tree);
pcfgTE.evaluate(tcEvalb.transformTree(tree2), tcEvalb.transformTree(tree), pw);
pcfgTEnoPunct.evaluate(tc.transformTree(tree2), tc.transformTree(tree), pw);
pcfgTE.evaluate(tcEvalb.transformTree(tree2, tree2), tcEvalb.transformTree(tree, tree), pw);
pcfgTEnoPunct.evaluate(tc.transformTree(tree2, tree2), tc.transformTree(tree, tree), pw);

if (op.doDep) {
comboTE.evaluate(tcEvalb.transformTree(tree4), tcEvalb.transformTree(tree), pw);
comboTEnoPunct.evaluate(tc.transformTree(tree4), tc.transformTree(tree), pw);
comboTE.evaluate(tcEvalb.transformTree(tree4, tree4), tcEvalb.transformTree(tree, tree), pw);
comboTEnoPunct.evaluate(tc.transformTree(tree4, tree4), tc.transformTree(tree, tree), pw);
}
System.out.println("PCFG only: " + parser.scoreBinarizedTree(tree2b, 0));

Expand All @@ -515,11 +515,11 @@ public static void main(String[] args) {

if (op.testOptions.evalb) {
if (op.doPCFG && op.doDep) {
EvalbFormatWriter.writeEVALBline(tcEvalb.transformTree(tree), tcEvalb.transformTree(tree4));
EvalbFormatWriter.writeEVALBline(tcEvalb.transformTree(tree, tree), tcEvalb.transformTree(tree4, tree4));
} else if (op.doPCFG) {
EvalbFormatWriter.writeEVALBline(tcEvalb.transformTree(tree), tcEvalb.transformTree(tree2));
EvalbFormatWriter.writeEVALBline(tcEvalb.transformTree(tree, tree), tcEvalb.transformTree(tree2, tree2));
} else if (op.doDep) {
EvalbFormatWriter.writeEVALBline(tcEvalb.transformTree(tree), tcEvalb.transformTree(tree3db));
EvalbFormatWriter.writeEVALBline(tcEvalb.transformTree(tree, tree), tcEvalb.transformTree(tree3db, tree3db));
}
}
} // end for each tree in test treebank
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -549,12 +549,12 @@ public String[] sisterSplitters() {
}

@Override
public TreeTransformer collinizer() {
public AbstractCollinizer collinizer() {
return new TreeCollinizer(treebankLanguagePack());
}

@Override
public TreeTransformer collinizerEvalb() {
public AbstractCollinizer collinizerEvalb() {
return new TreeCollinizer(treebankLanguagePack(),collinizerRetainsPunctuation,false);
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -24,13 +24,13 @@ protected GenericTreebankParserParams(TreebankLanguagePack tlp) {
}

@Override
public TreeTransformer collinizer() {
public AbstractCollinizer collinizer() {
// TODO Auto-generated method stub
return null;
}

@Override
public TreeTransformer collinizerEvalb() {
public AbstractCollinizer collinizerEvalb() {
// TODO Auto-generated method stub
return null;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -47,15 +47,15 @@ protected HebrewTreebankParserParams(TreebankLanguagePack tlp) {
}

@Override
public TreeTransformer collinizer() {
public AbstractCollinizer collinizer() {
return new TreeCollinizer(tlp, true, false);
}

/**
* Stand-in EVALB collinizer: simply reuses the collinizer returned by collinizer().
*/
@Override
public TreeTransformer collinizerEvalb() {
public AbstractCollinizer collinizerEvalb() {
return collinizer();
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -53,12 +53,12 @@ public HeadFinder typedDependencyHeadFinder() {
}

@Override
public TreeTransformer collinizer() {
public AbstractCollinizer collinizer() {
return new TreeCollinizer(tlp, true, false, 0);
}

@Override
public TreeTransformer collinizerEvalb() {
public AbstractCollinizer collinizerEvalb() {
return collinizer();
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -53,12 +53,12 @@ public HeadFinder typedDependencyHeadFinder() {
}

@Override
public TreeTransformer collinizer() {
public AbstractCollinizer collinizer() {
return new TreeCollinizer(tlp, true, false, 0);
}

@Override
public TreeTransformer collinizerEvalb() {
public AbstractCollinizer collinizerEvalb() {
return collinizer();
}

Expand Down
47 changes: 29 additions & 18 deletions src/edu/stanford/nlp/parser/lexparser/NegraPennCollinizer.java
Original file line number Diff line number Diff line change
Expand Up @@ -2,17 +2,18 @@
import edu.stanford.nlp.util.logging.Redwood;

import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;

import edu.stanford.nlp.ling.Label;
import edu.stanford.nlp.ling.StringLabel;
import edu.stanford.nlp.trees.LabeledScoredTreeFactory;
import edu.stanford.nlp.trees.Tree;
import edu.stanford.nlp.trees.Trees;
import edu.stanford.nlp.trees.TreeFactory;
import edu.stanford.nlp.trees.TreeTransformer;


public class NegraPennCollinizer implements TreeTransformer {
public class NegraPennCollinizer implements AbstractCollinizer {

/** A logger for this class */
Redwood.RedwoodChannels log = Redwood.channels(NegraPennCollinizer.class);
Expand All @@ -31,37 +32,47 @@ public NegraPennCollinizer(TreebankLangParserParams tlpp, boolean deletePunct) {

protected TreeFactory tf = new LabeledScoredTreeFactory();

public Tree transformTree(Tree tree) {
Label l = tree.label();
if (tree.isLeaf()) {
public Tree transformTree(Tree guess, Tree gold) {
if (guess == null || gold == null) return null;
if (guess.yield().size() != gold.yield().size()) {
return null;
}

return transformTree(guess, Trees.preTerminals(gold).iterator());
}

private Tree transformTree(Tree guess, Iterator<Tree> goldPreterminals) {
Label l = guess.label();
if (guess.isLeaf()) {
return tf.newLeaf(l);
}
String s = l.value();
s = tlpp.treebankLanguagePack().basicCategory(s);
if (deletePunct) {
// this is broken as it's not the right thing to do when there
// is any tag ambiguity -- and there is for ' (POS/''). Sentences
// can then have more or less words. It's also unnecessary for EVALB,
// since it ignores punctuation anyway
if (tree.isPreTerminal() && tlpp.treebankLanguagePack().isEvalBIgnoredPunctuationTag(s)) {
if (deletePunct && guess.isPreTerminal()) {
// Eliminate punctuation that is unwanted for evaluation by
// checking the tag in the gold tree rather than in the guess tree.
// This way, retagging does not change the results.
Tree goldPT = goldPreterminals.next();
String goldTag = tlpp.treebankLanguagePack().basicCategory(goldPT.value());
if (tlpp.treebankLanguagePack().isEvalBIgnoredPunctuationTag(goldTag)) {
return null;
}
}
// TEMPORARY: eliminate the TOPP constituent
if (tree.children()[0].label().value().equals("TOPP")) {
if (guess.children()[0].label().value().equals("TOPP")) {
log.info("Found a TOPP");
tree.setChildren(tree.children()[0].children());
guess.setChildren(guess.children()[0].children());
}

// Negra has lots of non-unary roots; delete unary roots
if (tlpp.treebankLanguagePack().isStartSymbol(s) && tree.numChildren() == 1) {
if (tlpp.treebankLanguagePack().isStartSymbol(s) && guess.numChildren() == 1) {
// NB: This deletes the boundary symbol, which is in the tree!
return transformTree(tree.getChild(0));
return transformTree(guess.getChild(0), goldPreterminals);
}
List<Tree> children = new ArrayList<>();
for (int cNum = 0, numC = tree.numChildren(); cNum < numC; cNum++) {
Tree child = tree.getChild(cNum);
Tree newChild = transformTree(child);
for (int cNum = 0, numC = guess.numChildren(); cNum < numC; cNum++) {
Tree child = guess.getChild(cNum);
Tree newChild = transformTree(child, goldPreterminals);
if (newChild != null) {
children.add(newChild);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -125,15 +125,15 @@ public DiskTreebank diskTreebank() {
* returns a NegraPennCollinizer
*/
@Override
public TreeTransformer collinizer() {
public AbstractCollinizer collinizer() {
return new NegraPennCollinizer(this);
}

/**
* returns a NegraPennCollinizer that doesn't delete punctuation
*/
@Override
public TreeTransformer collinizerEvalb() {
public AbstractCollinizer collinizerEvalb() {
return new NegraPennCollinizer(this, false);
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -262,12 +262,12 @@ public String[] sisterSplitters() {
}

@Override
public TreeTransformer collinizer() {
public AbstractCollinizer collinizer() {
return new TreeCollinizer(treebankLanguagePack());
}

@Override
public TreeTransformer collinizerEvalb() {
public AbstractCollinizer collinizerEvalb() {
return new TreeCollinizer(treebankLanguagePack());
}

Expand Down
Loading