
Commit 10dd38f

Add an option where the boundary 'words' of a sentence are used as the sinks in the constituency relative attention module
1 parent a6288d8 commit 10dd38f

3 files changed, +11 -4 lines changed


stanza/models/common/relative_attn.py

+5 -2
@@ -47,7 +47,7 @@ def __init__(self, d_model, num_heads, window=8, dropout=0.2, reverse=False, d_o
 
         self.reverse = reverse
 
-    def forward(self, x):
+    def forward(self, x, sink=None):
         # x.shape == (batch_size, seq_len, d_model)
         batch_size, seq_len, d_model = x.shape
         if d_model != self.d_model:
@@ -66,7 +66,10 @@ def forward(self, x):
         # could keep a parameter to train sinks, but as it turns out,
         # the position vectors just overlap that parameter space anyway
         # generally the model trains the sinks to zero if we do that
-        sink = torch.zeros((batch_size, self.num_sinks, d_model), dtype=x.dtype, device=x.device)
+        if sink is None:
+            sink = torch.zeros((batch_size, self.num_sinks, d_model), dtype=x.dtype, device=x.device)
+        else:
+            sink = sink.expand(batch_size, self.num_sinks, d_model)
         x = torch.cat((sink, x), axis=1)
 
         # k.shape = (batch_size, num_heads, d_head, seq_len + num_sinks)
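
For reference, a minimal standalone sketch of what the new branch does when a sink vector is passed in (stand-in sizes, outside the module; the real values come from the layer's configuration):

import torch

# stand-in sizes for the sketch
batch_size, seq_len, d_model, num_sinks = 1, 5, 8, 1

x = torch.randn(batch_size, seq_len, d_model)
sink = x[0, 0]                                       # a boundary word vector, shape (d_model,)

sink = sink.expand(batch_size, num_sinks, d_model)   # broadcast to (batch_size, num_sinks, d_model)
x = torch.cat((sink, x), axis=1)                     # prepend the sink position(s) to the sequence
assert x.shape == (batch_size, seq_len + num_sinks, d_model)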

stanza/models/constituency/lstm_model.py

+5 -2
@@ -824,9 +824,12 @@ def map_word(word):
         rattn_inputs = [[x] for x in all_word_inputs]
 
         if self.rel_attn_forward is not None:
-            rattn_inputs = [x + [self.rel_attn_forward(x[0].unsqueeze(0)).squeeze(0)] for x in rattn_inputs]
+            if self.args['rattn_use_endpoint_sinks']:
+                rattn_inputs = [x + [self.rel_attn_forward(x[0].unsqueeze(0), x[0][0]).squeeze(0)] for x in rattn_inputs]
+            else:
+                rattn_inputs = [x + [self.rel_attn_forward(x[0].unsqueeze(0)).squeeze(0)] for x in rattn_inputs]
         if self.rel_attn_reverse is not None:
-            rattn_inputs = [x + [self.rel_attn_reverse(x[0].unsqueeze(0)).squeeze(0)] for x in rattn_inputs]
+            rattn_inputs = [x + [self.rel_attn_reverse(x[0].unsqueeze(0), x[0][-1]).squeeze(0)] for x in rattn_inputs]
 
         if self.args['rattn_cat']:
             all_word_inputs = [torch.cat(x, axis=1) for x in rattn_inputs]
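
A small sketch of how the boundary words are picked out per sentence, using a hypothetical word tensor in place of x[0] from rattn_inputs (both sizes are made up):

import torch

words = torch.randn(7, 200)     # one row per word in the sentence

sentence = words.unsqueeze(0)   # (1, num_words, emb_dim), a batch of one sentence
forward_sink = words[0]         # first word -> sink for the forward attention
reverse_sink = words[-1]        # last word  -> sink for the reverse attention

# the module calls then mirror the diff above:
#   self.rel_attn_forward(sentence, forward_sink).squeeze(0)
#   self.rel_attn_reverse(sentence, reverse_sink).squeeze(0)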

stanza/models/constituency_parser.py

+1
@@ -737,6 +737,7 @@ def build_argparse():
     parser.add_argument('--rattn_cat', default=True, action='store_true', help='Stack the rattn layers instead of adding them')
     parser.add_argument('--rattn_dim', default=200, type=int, help='Dimension of the rattn output when cat')
     parser.add_argument('--rattn_sinks', default=0, type=int, help='Number of attention sink tokens to learn')
+    parser.add_argument('--rattn_use_endpoint_sinks', default=False, action='store_true', help='Use the endpoints of the sentences as sinks')
 
     parser.add_argument('--log_norms', default=False, action='store_true', help='Log the parameters norms while training. A very noisy option')
     parser.add_argument('--log_shapes', default=False, action='store_true', help='Log the parameters shapes at the beginning')
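
A standalone sketch of the new flag next to the existing sink count (only the two arguments from the diff are reproduced; the real build_argparse() defines many more). Since the sink block in relative_attn.py is expanded to num_sinks rows, the endpoint option presumably only takes effect when --rattn_sinks is at least 1:

import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--rattn_sinks', default=0, type=int,
                    help='Number of attention sink tokens to learn')
parser.add_argument('--rattn_use_endpoint_sinks', default=False, action='store_true',
                    help='Use the endpoints of the sentences as sinks')

args = parser.parse_args(['--rattn_sinks', '1', '--rattn_use_endpoint_sinks'])
print(args.rattn_sinks, args.rattn_use_endpoint_sinks)   # -> 1 True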
