nplan-io
diff --git a/‎src/transformers/models/bloom/causal_message_passing.py‎
Lines changed: 229 additions & 100 deletions b/‎src/transformers/models/bloom/causal_message_passing.py‎
Lines changed: 229 additions & 100 deletions
@@ -1,12 +1,17 @@
-""" A set of functions to perform message passing on a serialized graph in an LLM """
+""" A module for learning to pass information between elements on a serialized graph in an LLM
+    without violating the causality constraint of autoregressive generation (i.e. without passing
+    information backwards in the sequence)
+"""
 
 import enum
+from functools import partial
 from collections import defaultdict
 import itertools
 from typing import Dict, List, Optional, Tuple, Union
 
 import numpy as np
 import torch
+from torch_scatter import scatter, scatter_softmax
 import torch_geometric
 
 from .desequence_graph_ids import SequenceElement
@@ -18,92 +23,49 @@ class GNNLayerFactory(enum.Enum):
     gat = torch_geometric.nn.GATConv
 
 
-def build_message_passing_matrices(
-    token_ids: torch.Tensor,
-    edge_sequences: List[List[Tuple[SequenceElement, Optional[SequenceElement], Optional[SequenceElement]]]]
-) -> List[Dict[str, torch.Tensor]]:
-    """ Returns the adjacency matrices required to perform causal message passing in between
-        language model blocks of an autoregressive language model
+def graph_cross_attention(
+    values: torch.Tensor,
+    key_representations: torch.Tensor,
+    query_representations: torch.Tensor,
+    edge_index: torch.Tensor
+) -> torch.Tensor:
+    """ Performs graph attention on a set of prior probabilities using the representation of each
+        node in the graph to calculate the attention weights. The implemented attention is dot
+        product attention as implemented in the transformer architecture
     """
-    message_passing_dicts = []
-    for edge_sequence in edge_sequences:
-        message_passing_dict = defaultdict(list)
-        node2edge_idxs = defaultdict(list)
-        prev_node_idx = defaultdict(lambda: -1)
-
-        def add_element(end_idx: int, element_type: str):
-            """ Adds an element to the edge or node graphs used for message passing """
-            assert element_type in ['nodes', 'edges']
-            message_passing_dict[f"tokens2{element_type}"].append(end_idx - 1)
-            message_passing_dict[f"{element_type}2tokens"].append(end_idx)
-
-        for edge_idx, sequenced_edge in enumerate(edge_sequence):
-            pred_node, edge, succ_node = sequenced_edge
-            if edge_idx == len(edge_sequence) - 1:
-                if (
-                    not isinstance(succ_node, SequenceElement)
-                    and not isinstance(edge, SequenceElement)
-                ):
-                    continue
-                else:
-                    add_element(pred_node.end_idx, 'nodes')
-                    num_nodes = len(message_passing_dict["tokens2nodes"])
-                    if prev_node_idx[pred_node.ids] != -1:
-                        message_passing_dict['edge_index_nodes'].append(
-                            [prev_node_idx[pred_node.ids], num_nodes - 1]
-                        )
-            else:
-                add_element(pred_node.end_idx, 'nodes')
-                add_element(succ_node.end_idx, 'edges')
-                add_element(succ_node.end_idx, 'nodes')
-                node2edge_idxs[pred_node.ids].append(edge_idx)
-                node2edge_idxs[succ_node.ids].append(edge_idx)
-                num_nodes = len(message_passing_dict["tokens2nodes"])
-                message_passing_dict['edge_index_nodes'].append([num_nodes - 2, num_nodes - 1])
-                if prev_node_idx[pred_node.ids] != -1:
-                    message_passing_dict['edge_index_nodes'].append(
-                        [prev_node_idx[pred_node.ids], num_nodes - 2]
-                    )
-                if prev_node_idx[succ_node.ids] != -1:
-                    message_passing_dict['edge_index_nodes'].append(
-                        [prev_node_idx[succ_node.ids], num_nodes - 1]
-                    )
-                prev_node_idx[pred_node.ids] = num_nodes - 2
-                prev_node_idx[succ_node.ids] = num_nodes - 1
+    scaling_constant = torch.Tensor(
+        np.sqrt([key_representations.size(1)])
+    ).to(key_representations.device)
+    dot_products = (
+        query_representations[edge_index[1]]
+        * key_representations[edge_index[0]]
+    ).sum(1) / scaling_constant
+    weights = scatter_softmax(src=dot_products, index=edge_index[1], dim=0)
+    weighted_probs = weights.unsqueeze(1) * values[edge_index[0]]
+    return scatter(src=weighted_probs, index=edge_index[1], dim=0)
 
-        for edge_idxs in node2edge_idxs.values():
-            if len(edge_idxs) < 2:
-                continue
-            for (idx0, idx1) in itertools.combinations(list(set(edge_idxs)), 2):
-                message_passing_dict['edge_index_edges'].append(sorted([idx0, idx1]))
 
-        def to_torch(array: Union[List[int], List[List[int]]]) -> torch.Tensor:
-            """ Converts an array to a torch Tensor and returns it"""
-            if len(array) == 0 or isinstance(array[0], int):
-                return torch.from_numpy(np.array(array)).long().to(token_ids.device)
-            else:
-                return torch.from_numpy(np.array(array).transpose(1, 0)).long().to(token_ids.device)
+class GatedGraphCrossAttentionLayer(torch.nn.Module):
+    """ A module for performing gated cross attention between elements in a graph that
+        have been serialized in a sequence of tokens and the token sequence
 
-        message_passing_dict['tokens2edges'] = to_torch(message_passing_dict['tokens2edges'])
-        message_passing_dict['edges2tokens'] = to_torch(message_passing_dict['edges2tokens'])
-        message_passing_dict['tokens2nodes'] = to_torch(message_passing_dict['tokens2nodes'])
-        message_passing_dict['nodes2tokens'] = to_torch(message_passing_dict['nodes2tokens'])
-        message_passing_dict['edge_index_nodes'] = to_torch(message_passing_dict['edge_index_nodes'])
-        message_passing_dict['edge_index_edges'] = to_torch(message_passing_dict['edge_index_edges'])
-        message_passing_dicts.append(dict(message_passing_dict))
-    return message_passing_dicts
+        A key element of this layer is that it enforces that information about elements in the graph
+        can only be passed to tokens describing later elements in the sequence
 
+        This layer contains methods to pass information either between nodes or edges within
+        the serialized graph
 
-class CausalMessagePassingLayer(torch.nn.Module):
-    """ A torch.nn.Module for performing causal message passing within an autoregressive
-        language model
+        This layer is heavily inspired by Flamingo, a paper on incorporating image information
+        into LLM inference - https://arxiv.org/pdf/2204.14198
     """
     def __init__(self, gnn_type: str, embedding_size: int):
         super().__init__()
-        self.nodes_layer = GNNLayerFactory[gnn_type].value(embedding_size, embedding_size)
-        self.edges_layer = GNNLayerFactory[gnn_type].value(embedding_size, embedding_size)
-        self.gating_parameter_a = torch.nn.Parameter(torch.zeros(1))
-        self.gating_parameter_b = torch.nn.Parameter(torch.zeros(1))
+        self.gnn_layer = GNNLayerFactory[gnn_type].value(embedding_size, embedding_size)
+        self.gating_message_passing = torch.nn.Parameter(torch.zeros(1))
+        self.gating_linear = torch.nn.Parameter(torch.zeros(1))
+        self.key_embedder = torch.nn.Linear(embedding_size, 64)
+        self.query_embedder = torch.nn.Linear(embedding_size, 64)
+        self.linear_layer = torch.nn.Linear(embedding_size, embedding_size)
 
     def forward(
         self,
@@ -112,28 +74,195 @@ def forward(
     ) -> torch.Tensor:
         new_token_embeddings = []
         for t_embeddings, message_passing_dict in zip(token_embeddings, message_passing_dicts):
-            token_edges_embeddings = torch.zeros_like(t_embeddings)
-            token_nodes_embeddings = torch.zeros_like(t_embeddings)
-            if message_passing_dict['tokens2edges'].numel() > 0:
-                edges_embeddings = t_embeddings[message_passing_dict['tokens2edges']]
-                if message_passing_dict['edge_index_edges'].numel() > 0:
-                    edges_embeddings = self.edges_layer(
-                        edges_embeddings,
-                        message_passing_dict['edge_index_edges']
-                    )
-                token_edges_embeddings[message_passing_dict['edges2tokens']] = edges_embeddings
-            if message_passing_dict['tokens2nodes'].numel() > 0:
-                nodes_embeddings = t_embeddings[message_passing_dict['tokens2nodes']]
-                if message_passing_dict['edge_index_nodes'].numel() > 0:
-                    nodes_embeddings = self.nodes_layer(
-                        nodes_embeddings,
-                        message_passing_dict['edge_index_nodes']
+            new_t_embeddings = torch.zeros_like(t_embeddings)
+            if message_passing_dict['tokens2elements'].numel() > 0:
+                element_embeddings = t_embeddings[message_passing_dict['tokens2elements']]
+                if message_passing_dict['edge_index'].numel() > 0:
+                    element_embeddings = self.gnn_layer(
+                        element_embeddings,
+                        message_passing_dict['edge_index']
                     )
-                token_nodes_embeddings[message_passing_dict['nodes2tokens']] = nodes_embeddings
-            new_t_embeddings = (
-                t_embeddings
-                + torch.tanh(self.gating_parameter_a) * token_edges_embeddings
-                + torch.tanh(self.gating_parameter_b) * token_nodes_embeddings
-            )
+                start_idx, end_idx = message_passing_dict['slice_idxs']
+                new_t_embeddings[start_idx:end_idx] = graph_cross_attention(
+                    values=element_embeddings,
+                    key_representations=self.key_embedder(element_embeddings),
+                    query_representations=self.query_embedder(t_embeddings),
+                    edge_index=message_passing_dict['elements2tokens']
+                )[start_idx:]
+            new_t_embeddings = t_embeddings + torch.tanh(self.gating_message_passing) * new_t_embeddings
+            new_t_embeddings = new_t_embeddings + torch.tanh(self.gating_linear) * self.linear_layer(new_t_embeddings)
             new_token_embeddings.append(new_t_embeddings.unsqueeze(0))
         return torch.cat(new_token_embeddings, dim=0)
+
+    @classmethod
+    def build_node_information_passing(
+        cls,
+        edge_sequences: List[List[Tuple[SequenceElement, Optional[SequenceElement], Optional[SequenceElement]]]],
+        device: torch.device
+    ) -> List[Dict[str, torch.Tensor]]:
+        """ Builds, for every serialized graph in edge_sequences, the index mappings used to pass
+            node information in between language model blocks of an autoregressive language model
+
+            Every (predecessor node, edge, successor node) entry except the last registers both of
+            its nodes through add_node. The last entry registers only its predecessor node, and
+            only when its edge or its successor node is a SequenceElement. The lists filled this
+            way ('tokens2elements', 'elements2tokens', 'edge_index') are converted with to_torch
+        """
+        def index_nodes(edge_sequence):
+            """ Returns the converted index mappings of a single serialized graph """
+            mappings = {'tokens2elements': [], 'elements2tokens': [], 'edge_index': []}
+            sequence_end = cls.get_sequence_end(edge_sequence)
+            last_seen = defaultdict(lambda: -1)
+            final_position = len(edge_sequence) - 1
+            for position, (pred_node, edge, succ_node) in enumerate(edge_sequence):
+                if position != final_position:
+                    nodes = (pred_node, succ_node)
+                elif isinstance(succ_node, SequenceElement) or isinstance(edge, SequenceElement):
+                    nodes = (pred_node,)
+                else:
+                    # the last entry holds nothing but its predecessor node, it is skipped
+                    nodes = ()
+                for node in nodes:
+                    cls.add_node(
+                        node,
+                        end_idx=sequence_end,
+                        last_occurence_idx=last_seen,
+                        message_passing_dict=mappings
+                    )
+            return cls.to_torch(dict(mappings), device)
+
+        return [index_nodes(edge_sequence) for edge_sequence in edge_sequences]
+
+    @classmethod
+    def build_edge_information_passing(
+        cls,
+        edge_sequences: List[List[Tuple[SequenceElement, Optional[SequenceElement], Optional[SequenceElement]]]],
+        device: torch.device
+    ) -> List[Dict[str, torch.Tensor]]:
+        """ Builds, for every serialized graph in edge_sequences, the index mappings used to pass
+            edge information in between language model blocks of an autoregressive language model
+
+            Every entry except the last one is registered through add_edge. Edges that share a
+            node are then connected pairwise in 'edge_index' and the lists are converted with
+            to_torch
+        """
+        message_passing_dicts = []
+        for edge_sequence in edge_sequences:
+            mappings = {'tokens2elements': [], 'elements2tokens': [], 'edge_index': []}
+            edges_by_node = defaultdict(list)
+            sequence_end = cls.get_sequence_end(edge_sequence)
+            for sequenced_edge in edge_sequence[:-1]:
+                cls.add_edge(
+                    sequenced_edge,
+                    end_idx=sequence_end,
+                    node2edge_idxs=edges_by_node,
+                    message_passing_dict=mappings
+                )
+
+            # adjacency between edges: each pair of distinct edges touching the same node is
+            # linked, and the pair is sorted so that it points from the lower element index to
+            # the higher one (from the edge registered earlier to the one registered later)
+            for edge_idxs in edges_by_node.values():
+                if len(edge_idxs) >= 2:
+                    mappings['edge_index'].extend(
+                        sorted(pair) for pair in itertools.combinations(set(edge_idxs), 2)
+                    )
+            message_passing_dicts.append(cls.to_torch(dict(mappings), device))
+        return message_passing_dicts
+
+    @staticmethod
+    def get_sequence_end(
+        edge_sequence: List[Tuple[SequenceElement, Optional[SequenceElement], Optional[SequenceElement]]],
+    ) -> int:
+        """ Returns last index + 1 of elements in the serialized graph sequence
+
+            The value is the end_idx of the last entry's successor node if that is a
+            SequenceElement, otherwise of its edge if that is a SequenceElement, otherwise of its
+            predecessor node. Raises an IndexError if edge_sequence is empty
+        """
+        pred_node, edge, succ_node = edge_sequence[-1]
+        # edge and successor node are Optional in the last entry, so the right-most element
+        # that is actually present decides where the sequence ends
+        for element in (succ_node, edge):
+            if isinstance(element, SequenceElement):
+                return element.end_idx
+        return pred_node.end_idx
+
+    @classmethod
+    def add_node(
+        cls,
+        current_occurence: SequenceElement,
+        end_idx: int,
+        last_occurence_idx: Dict[Tuple[int], int],
+        message_passing_dict: Dict[str, Union[List[int], List[List[int]]]]
+    ):
+        """ Each time a node is listed in a serialized version of its corresponding graph, it is
+            added as a node in a new artificial graph. This means in the new artificial graph, a
+            node in the original graph may appear more than once. For every node added to the
+            artificial graph, this function adds an edge which maps between occurrences of
+            the same node in the original graph if the node has been printed previously in the
+            serialized graph. The edge points from the previous occurrence to the current
+            occurrence, i.e. H_1 - O, O - H_2, would create an edge from O -> O since it occurs
+            more than once in the graph
+
+            last_occurence_idx maps the ids of a node to the element index of its most recent
+            occurrence (a missing entry or -1 means the node has not been added yet) and is
+            updated in place. Nothing is recorded when the occurrence is not added as an element
+            (add_element_for_information_passing adds nothing when it ends at end_idx)
+        """
+        elements = message_passing_dict["tokens2elements"]
+        prev_length = len(elements)
+        cls.add_element_for_information_passing(
+            start_idx=current_occurence.end_idx,
+            end_idx=end_idx,
+            message_passing_dict=message_passing_dict
+        )
+        if len(elements) == prev_length:
+            return
+        current_idx = len(elements) - 1
+        previous_idx = last_occurence_idx.get(current_occurence.ids, -1)
+        if previous_idx != -1:
+            message_passing_dict['edge_index'].append([previous_idx, current_idx])
+        # the occurrence is remembered even when it is the first one of its node; updating it
+        # only when a previous occurrence exists would leave every entry at -1 forever and no
+        # edge would ever be created
+        last_occurence_idx[current_occurence.ids] = current_idx
+
+    @classmethod
+    def add_edge(
+        cls,
+        sequenced_edge: Tuple[SequenceElement, SequenceElement, SequenceElement],
+        end_idx: int,
+        node2edge_idxs: Dict[Tuple[int], List[int]],
+        message_passing_dict: Dict[str, Union[List[int], List[List[int]]]]
+    ):
+        """ Registers an edge of the serialized graph as an element to pass information between
+
+            The element is anchored at the end_idx of the edge's successor node. When an element
+            was actually added, its index is appended to node2edge_idxs under the ids of both the
+            predecessor and the successor node, so edges sharing a node can be found later
+        """
+        pred_node, _, succ_node = sequenced_edge
+        elements = message_passing_dict["tokens2elements"]
+        count_before = len(elements)
+        cls.add_element_for_information_passing(
+            start_idx=succ_node.end_idx,
+            end_idx=end_idx,
+            message_passing_dict=message_passing_dict
+        )
+        if len(elements) <= count_before:
+            # no element was added for this edge, so there is nothing to index
+            return
+        new_idx = len(elements) - 1
+        for node in (pred_node, succ_node):
+            node2edge_idxs[node.ids].append(new_idx)
+
+    @staticmethod
+    def add_element_for_information_passing(
+        start_idx: int,
+        end_idx: int,
+        message_passing_dict: Dict[str, Union[List[int], List[List[int]]]]
+    ):
+        """ Adds an element (either a node or an edge) to the message passing dictionary
+
+            Unless start_idx equals end_idx, start_idx - 1 is appended to 'tokens2elements' and
+            one [element index, sequence index] pair is appended to 'elements2tokens' for every
+            sequence index in range(start_idx, end_idx), so that it is possible to map to
+            elements and back. The dictionary is modified in place
+        """
+        if start_idx == end_idx:
+            return
+        message_passing_dict["tokens2elements"].append(start_idx - 1)
+        element_idx = len(message_passing_dict["tokens2elements"]) - 1
+        message_passing_dict["elements2tokens"].extend(
+            [element_idx, sequence_idx] for sequence_idx in range(start_idx, end_idx)
+        )
+
+    @staticmethod
+    def to_torch(
+        array_dict: Dict[str, Union[List[int], List[List[int]]]],
+        device: torch.device
+    ) -> Dict[str, torch.Tensor]:
+        """ Converts a dictionary of lists of integers to a dictionary of long torch Tensors
+
+            Lists that are empty or start with an int are converted as they are, any other list
+            is transposed after conversion. When 'elements2tokens' is not empty, a 'slice_idxs'
+            entry is added that holds the smallest value and the largest value + 1 of its second
+            row. The dictionary is modified in place and returned
+        """
+        def as_long_tensor(values):
+            """ Returns values as a long Tensor on device, transposing nested lists """
+            as_array = np.array(values)
+            if len(values) > 0 and not isinstance(values[0], int):
+                as_array = as_array.transpose(1, 0)
+            return torch.from_numpy(as_array).long().to(device)
+
+        for key in list(array_dict):
+            array_dict[key] = as_long_tensor(array_dict[key])
+
+        element_targets = array_dict['elements2tokens']
+        if element_targets.numel() > 0:
+            lowest = element_targets[1].min().item()
+            highest = element_targets[1].max().item()
+            array_dict['slice_idxs'] = torch.from_numpy(
+                np.array([lowest, highest + 1])
+            ).long().to(device)
+        return array_dict