From 46ec94c80a1141654836971ecacfe9c551ea70c9 Mon Sep 17 00:00:00 2001
From: Christopher Potts
Date: Sun, 2 Feb 2020 11:39:46 -0800
Subject: [PATCH] Optional y arg to tree network fit methods to allow
 cross-validation

---
 np_model_base.py             |  5 +++
 np_tree_nn.py                | 41 ++++++++++++++++++++---
 sst_03_neural_networks.ipynb | 19 +++++++++++
 test/test_models.py          | 22 +++++++++---
 torch_tree_nn.py             | 65 ++++++++++++++++++------------------
 5 files changed, 110 insertions(+), 42 deletions(-)

diff --git a/np_model_base.py b/np_model_base.py
index 1ae2a19..bd2acc4 100644
--- a/np_model_base.py
+++ b/np_model_base.py
@@ -46,6 +46,10 @@ def fit(self, X, y):
         y : list
             The one-hot label vector.
 
+        Returns
+        -------
+        self
+
         """
         y = self.prepare_output_data(y)
         self.initialize_parameters()
@@ -75,6 +79,7 @@ def fit(self, X, y):
             progress_bar(
                 "Finished epoch {} of {}; error is {}".format
                 (iteration, self.max_iter, error))
+        return self
 
     @staticmethod
     def get_error(predictions, labels):
diff --git a/np_tree_nn.py b/np_tree_nn.py
index 5a46b11..c0f1677 100644
--- a/np_tree_nn.py
+++ b/np_tree_nn.py
@@ -19,7 +19,25 @@ def __init__(self, vocab, embedding=None, embed_dim=50, **kwargs):
         self.hidden_dim = self.embed_dim * 2
 
     def fit(self, X, y=None):
-        y = [t.label() for t in X]
+        """Fairly standard `fit` method except that, if `y=None`,
+        then the labels `y` are presumed to come from the root nodes
+        of the trees in `X`. We retain the option of giving them
+        as a separate argument for consistency with the other model
+        interfaces, and so that we can use sklearn cross-validation
+        methods with this class.
+
+        Parameters
+        ----------
+        X : list of `nltk.Tree` instances
+        y : iterable of labels, or None
+
+        Returns
+        -------
+        self
+
+        """
+        if y is None:
+            y = [t.label() for t in X]
         return super(TreeNN, self).fit(X, y)
 
     def initialize_parameters(self):
@@ -160,7 +178,7 @@ def set_params(self, **params):
         self.hidden_dim = self.embed_dim * 2
 
 
-def simple_example():
+def simple_example(initial_embedding=False, separate_y=False):
     from nltk.tree import Tree
     import utils
 
@@ -190,13 +208,26 @@ def simple_example():
 
     X_test = [Tree.fromstring(x) for x in test]
 
+    if initial_embedding:
+        import numpy as np
+        embedding = np.random.uniform(
+            low=-1.0, high=1.0, size=(len(vocab), 50))
+    else:
+        embedding = None
+
     model = TreeNN(
         vocab,
         embed_dim=50,
         hidden_dim=50,
-        max_iter=100)
+        max_iter=100,
+        embedding=embedding)
+
+    if separate_y:
+        y = [t.label() for t in X_train]
+    else:
+        y = None
 
-    model.fit(X_train)
+    model.fit(X_train, y=y)
 
     print("\nTest predictions:")
 
@@ -213,4 +244,4 @@
 
 
 if __name__ == '__main__':
-    simple_example()
+    simple_example(initial_embedding=False, separate_y=False)
diff --git a/sst_03_neural_networks.ipynb b/sst_03_neural_networks.ipynb
index 0d01e3e..73acdb4 100644
--- a/sst_03_neural_networks.ipynb
+++ b/sst_03_neural_networks.ipynb
@@ -914,6 +914,18 @@
    "%time _ = tree_nn_glove.fit(X_tree_train)"
   ]
  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Important: if you wish to cross-validate this model using scikit-learn methods, then you'll need to give the labels as a separate argument, as in \n",
+    "\n",
+    "```\n",
+    "y_tree_train = [t.label() for t in X_tree_train]\n",
+    "tree_nn_glove.fit(X_tree_train, y_tree_train)\n",
+    "```"
+   ]
+  },
  {
   "cell_type": "code",
   "execution_count": 41,
@@ -995,6 +1007,13 @@
    "%time _ = torch_tree_nn_glove.fit(X_tree_train)"
   ]
  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "As with `TreeNN` above, you have the option of specifying the labels separately, and this is required if you are cross-validating the model using scikit-learn methods."
+   ]
+  },
  {
   "cell_type": "code",
   "execution_count": 45,
diff --git a/test/test_models.py b/test/test_models.py
index 46d705a..b727860 100644
--- a/test/test_models.py
+++ b/test/test_models.py
@@ -270,13 +270,24 @@ def test_torch_autoencoder_simple_example():
     assert mse < 0.0001
 
 
-def test_np_tree_nn_simple_example():
-    np_tree_nn.simple_example()
+@pytest.mark.parametrize("initial_embedding, separate_y", [
+    [True, True],
+    [True, False],
+    [False, True],
+    [False, False]
+])
+def test_np_tree_nn_simple_example(initial_embedding, separate_y):
+    np_tree_nn.simple_example(initial_embedding, separate_y)
 
 
-@pytest.mark.parametrize("initial_embedding", [True, False])
-def test_torch_tree_nn_simple_example(initial_embedding):
-    torch_tree_nn.simple_example(initial_embedding)
+@pytest.mark.parametrize("initial_embedding, separate_y", [
+    [True, True],
+    [True, False],
+    [False, True],
+    [False, False]
+])
+def test_torch_tree_nn_simple_example(initial_embedding, separate_y):
+    torch_tree_nn.simple_example(initial_embedding, separate_y)
 
 
 def test_torch_tree_nn_incremental(X_tree):
@@ -437,6 +448,7 @@ def test_torch_rnn_classifier_save_load(X_sequence):
     mod2.predict(X_test)
     mod2.fit(X, y)
 
+
 def test_torch_tree_nn_save_load(X_tree):
     X, vocab = X_tree
     mod = torch_tree_nn.TorchTreeNN(
diff --git a/torch_tree_nn.py b/torch_tree_nn.py
index 5ee246a..d211294 100644
--- a/torch_tree_nn.py
+++ b/torch_tree_nn.py
@@ -32,8 +32,7 @@ def _define_embedding(self, embedding):
 
     def forward(self, tree):
         """Recursively interprets `tree`, applying a classifier layer
-        to the final representation. The label comes from the root
-        of the tree itself.
+        to the final representation.
 
         Parameters
         ----------
@@ -45,7 +44,7 @@ def forward(self, tree):
 
         """
         root = self.interpret(tree)
-        return self.classifier_layer(root), tree.label()
+        return self.classifier_layer(root)
 
     def interpret(self, subtree):
         # Terminal nodes are str:
@@ -85,13 +84,18 @@ def build_graph(self):
             output_dim=self.n_classes_,
             hidden_activation=self.hidden_activation)
 
-    def fit(self, X, **kwargs):
-        """Fairly standard `fit` method except that the labels `y` are
-        presumed to come from the root nodes of the trees in `X`.
+    def fit(self, X, y=None, **kwargs):
+        """Fairly standard `fit` method except that, if `y=None`,
+        then the labels `y` are presumed to come from the root nodes
+        of the trees in `X`. We retain the option of giving them
+        as a separate argument for consistency with the other model
+        interfaces, and so that we can use sklearn cross-validation
+        methods with this class.
 
         Parameters
         ----------
         X : list of `nltk.Tree` instances
+        y : iterable of labels, or None
         kwargs : dict
             For passing other parameters.
             If 'X_dev' is included, then performance is monitored every 10 epochs; use
@@ -102,28 +106,35 @@
         self
 
         """
+        # Labels:
+        if y is None:
+            y = [t.label() for t in X]
+        self.classes_ = sorted(set(y))
+        self.n_classes_ = len(self.classes_)
+        self.class2index = dict(zip(self.classes_, range(self.n_classes_)))
         # Incremental performance:
         X_dev = kwargs.get('X_dev')
         if X_dev is not None:
             dev_iter = kwargs.get('dev_iter', 10)
-        # Data prep:
-        self.classes_ = self.get_classes(X)
-        self.n_classes_ = len(self.classes_)
-        self.class2index = dict(zip(self.classes_, range(self.n_classes_)))
+        # Model:
         if not self.warm_start or not hasattr(self, "model"):
             self.model = self.build_graph()
             self.model.to(self.device)
         self.model.train()
+        # Optimization:
         loss = nn.CrossEntropyLoss()
         optimizer = self.optimizer(self.model.parameters(), lr=self.eta)
+        # Train:
+        dataset = list(zip(X, y))
        for iteration in range(1, self.max_iter+1):
             epoch_error = 0.0
-            random.shuffle(X)
-            for tree in X:
-                pred, label = self.model.forward(tree)
+            random.shuffle(dataset)
+            for tree, label in dataset:
+                pred = self.model.forward(tree)
                 label = self.convert_label(label)
                 err = loss(pred, label)
                 epoch_error += err.item()
@@ -140,21 +151,6 @@
                     iteration, self.max_iter, epoch_error/len(X)))
         return self
 
-    @staticmethod
-    def get_classes(X):
-        """Classes as given by the root nodes in `X`.
-
-        Parameters
-        ----------
-        X : list of nltk.tree.Tree
-
-        Returns
-        -------
-        list
-
-        """
-        return sorted({t.label() for t in X})
-
     def convert_label(self, label):
         """Convert a class label to a format that PyTorch can handle.
@@ -186,7 +182,7 @@ def predict_proba(self, X):
         with torch.no_grad():
             preds = []
             for tree in X:
-                pred, _ = self.model.forward(tree)
+                pred = self.model.forward(tree)
                 preds.append(pred.squeeze())
             preds = torch.stack(preds)
             return torch.softmax(preds, dim=1).numpy()
@@ -209,7 +205,7 @@
         return [self.classes_[i] for i in probs.argmax(axis=1)]
 
 
-def simple_example(initial_embedding=False):
+def simple_example(initial_embedding=False, separate_y=False):
     from nltk.tree import Tree
 
     train = [
@@ -249,7 +245,12 @@
         max_iter=50,
         embedding=embedding)
 
-    mod.fit(X_train)
+    if separate_y:
+        y = [t.label() for t in X_train]
+    else:
+        y = None
+
+    mod.fit(X_train, y=y)
 
     print("\nTest predictions:")
 
@@ -266,4 +267,4 @@
 
 
 if __name__ == '__main__':
-    simple_example()
+    simple_example(separate_y=True)
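
For reference, here is the sort of workflow the optional `y` argument enables. This is a minimal sketch, not part of the patch: the toy trees, vocabulary, and hyperparameters are illustrative stand-ins, and it assumes (as the notebook note indicates) that the model exposes the scikit-learn estimator interface (`get_params`/`set_params`) so that `cross_validate` can clone it.

```
from nltk.tree import Tree
from sklearn.model_selection import cross_validate

from np_tree_nn import TreeNN

# Toy parity trees in the spirit of `simple_example`; the exact
# strings and vocabulary here are illustrative stand-ins.
train = [
    "(odd 1)",
    "(even 2)",
    "(even (odd 1) (neutral (neutral +) (odd 1)))",
    "(odd (odd 1) (neutral (neutral +) (even 2)))",
    "(odd (even 2) (neutral (neutral +) (odd 1)))",
    "(even (even 2) (neutral (neutral +) (even 2)))"]

X = [Tree.fromstring(s) for s in train] * 5  # 30 examples for 3 folds
vocab = ["1", "2", "+", "$UNK"]

# The point of the patch: the labels are supplied separately, so
# scikit-learn can index X and y in parallel when building folds.
y = [t.label() for t in X]

model = TreeNN(vocab, embed_dim=20, max_iter=10)
scores = cross_validate(model, X, y, cv=3, scoring="accuracy")
print(scores["test_score"])
```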
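
Relatedly, the `X_dev`/`dev_iter` keywords that the `TorchTreeNN.fit` docstring mentions are independent of how `y` is supplied. A sketch under the same assumptions (illustrative data; the dev trees carry root-node labels too):

```
from nltk.tree import Tree

from torch_tree_nn import TorchTreeNN

# Illustrative data; the root-node labels are the targets.
X_train = [Tree.fromstring(s) for s in ["(odd 1)", "(even 2)"]] * 10
X_dev = [Tree.fromstring(s) for s in ["(even 2)", "(odd 1)"]]

mod = TorchTreeNN(["1", "2", "$UNK"], embed_dim=50, max_iter=50)

# y=None, so labels come from the root nodes of X_train; dev-set
# performance is monitored every 5 epochs instead of the default 10.
mod.fit(X_train, X_dev=X_dev, dev_iter=5)
```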