
Commit 2620e4c

Replace manual implementation of CLIPAttention with MultiHeadAttention
1 parent 7ddf4ec commit 2620e4c

File tree

2 files changed: +53 −98 lines changed

keras_nlp/src/models/stable_diffusion_v3/clip_attention.py

Lines changed: 44 additions & 94 deletions
@@ -15,97 +15,47 @@
 from keras import ops
 
 
-class CLIPAttention(layers.Layer):
-    def __init__(self, num_heads, hidden_dim, dropout=0.0, **kwargs):
-        super().__init__(**kwargs)
-        if hidden_dim % num_heads != 0:
-            raise ValueError(
-                "`hidden_dim` must be divisible by num_heads. "
-                f"Received: num_heads={num_heads}, hidden_dim={hidden_dim}"
-            )
-        self.num_heads = num_heads
-        self.hidden_dim = hidden_dim
-        self.dropout = dropout
-        self.head_dim = self.hidden_dim // self.num_heads
-
-        self.dropout_layer = layers.Dropout(self.dropout)
-        self.scale = self.head_dim**-0.5
-        self.query_dense = layers.Dense(
-            units=self.hidden_dim, dtype=self.dtype_policy, name="query"
-        )
-        self.key_dense = layers.Dense(
-            units=self.hidden_dim, dtype=self.dtype_policy, name="key"
-        )
-        self.value_dense = layers.Dense(
-            units=self.hidden_dim, dtype=self.dtype_policy, name="value"
-        )
-        self.softmax = layers.Softmax(dtype="float32")
-        self.output_dense = layers.Dense(
-            units=self.hidden_dim,
-            dtype=self.dtype_policy,
-            name="attention_output",
-        )
-
-    def build(self, input_shape):
-        self.query_dense.build(input_shape)
-        self.key_dense.build(input_shape)
-        self.value_dense.build(input_shape)
-        self.output_dense.build([None, None, self.hidden_dim])
-
-    def compute_output_shape(self, input_shape):
-        output_shape = list(input_shape)
-        output_shape[-1] = self.hidden_dim
-        return output_shape
-
-    def _transpose_for_scores(self, inputs):
-        batch_size = ops.shape(inputs)[0]
-        inputs = ops.reshape(
-            inputs, (batch_size, -1, self.num_heads, self.head_dim)
-        )
-        return ops.transpose(inputs, axes=[0, 2, 1, 3])
-
-    def call(self, x, attention_mask=None, training=None):
-        batch_size = ops.shape(x)[0]
-        query = self.query_dense(x)
-        key = self.key_dense(x)
-        value = self.value_dense(x)
-        query = self._transpose_for_scores(query)
-        key = self._transpose_for_scores(key)
-        value = self._transpose_for_scores(value)
-
-        attention_logits = ops.matmul(
-            query, ops.transpose(key, axes=[0, 1, 3, 2])
-        )
-        dk = ops.cast(ops.sqrt(self.head_dim), dtype=attention_logits.dtype)
-        attention_logits = ops.divide(attention_logits, dk)
-
-        if attention_mask is not None:
-            attention_logits = ops.add(attention_logits, attention_mask)
-
-        orig_dtype = attention_logits.dtype
-        attention_softmax = self.softmax(attention_logits)
-        attention_softmax = ops.cast(attention_softmax, orig_dtype)
-
-        if self.dropout:
-            attention_softmax = self.dropout_layer(
-                attention_softmax, training=training
-            )
-
-        attention_output = ops.matmul(attention_softmax, value)
-        attention_output = ops.transpose(attention_output, axes=[0, 2, 1, 3])
-        attention_output = ops.reshape(
-            attention_output, (batch_size, -1, self.hidden_dim)
-        )
-        attention_output = self.output_dense(attention_output)
-        return attention_output
-
-    def get_config(self):
-        config = super().get_config()
-        config.update(
-            {
-                "num_heads": self.num_heads,
-                "hidden_dim": self.hidden_dim,
-                "dropout": self.dropout,
-            }
-        )
-        return config
+class CLIPAttention(layers.MultiHeadAttention):
+    def __init__(
+        self,
+        num_heads,
+        key_dim,
+        value_dim=None,
+        dropout=0.0,
+        use_bias=True,
+        output_shape=None,
+        attention_axes=None,
+        kernel_initializer="glorot_uniform",
+        bias_initializer="zeros",
+        kernel_regularizer=None,
+        bias_regularizer=None,
+        activity_regularizer=None,
+        kernel_constraint=None,
+        bias_constraint=None,
+        seed=None,
+        **kwargs,
+    ):
+        super().__init__(
+            num_heads,
+            key_dim,
+            value_dim,
+            dropout,
+            use_bias,
+            output_shape,
+            attention_axes,
+            kernel_initializer,
+            bias_initializer,
+            kernel_regularizer,
+            bias_regularizer,
+            activity_regularizer,
+            kernel_constraint,
+            bias_constraint,
+            seed,
+            **kwargs,
+        )
+
+    def _masked_softmax(self, attention_scores, attention_mask=None):
+        # In CLIP model, the computation of `attention_mask` is performed
+        # differently from `MultiHeadAttention`.
+        attention_scores = ops.add(attention_scores, attention_mask)
+        return self._softmax(attention_scores)
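
The only behavioral change in the subclass is the `_masked_softmax` override: the stock `keras.layers.MultiHeadAttention` hands `attention_mask` to its softmax layer as a boolean keep/discard mask, whereas the CLIP encoder pre-computes an additive float mask (0.0 where attention is allowed, a large negative value where it is not) and adds it straight onto the attention scores, exactly as the removed `call` did with `ops.add`. A minimal sketch of that additive convention follows; the helper name and sizes are illustrative, not part of this commit.

import numpy as np
from keras import ops


def causal_additive_mask(seq_len, dtype="float32"):
    # Illustrative helper: 0.0 where attention is allowed, a large negative
    # value where it is not, so masked scores vanish after the softmax.
    return ops.convert_to_tensor(
        np.triu(np.full((seq_len, seq_len), -1e9, dtype=dtype), k=1)
    )


mask = causal_additive_mask(4)
# mask[i, j] == 0.0 for j <= i and -1e9 for j > i; adding it to the raw
# attention scores before the softmax is the convention `_masked_softmax`
# above relies on.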

keras_nlp/src/models/stable_diffusion_v3/clip_encoder_block.py

Lines changed: 9 additions & 4 deletions
@@ -33,6 +33,11 @@ def __init__(
         **kwargs,
     ):
         super().__init__(**kwargs)
+        if hidden_dim % num_heads != 0:
+            raise ValueError(
+                "`hidden_dim` must be divisible by `num_heads`. "
+                f"Received: hidden_dim={hidden_dim}, num_heads={num_heads}"
+            )
         self.hidden_dim = hidden_dim
         self.num_heads = num_heads
         self.intermediate_dim = intermediate_dim
@@ -45,8 +50,8 @@ def __init__(
             epsilon=0.00001, dtype=self.dtype_policy, name="layer_norm_1"
         )
         self.attention = CLIPAttention(
-            self.num_heads,
-            self.hidden_dim,
+            num_heads,
+            hidden_dim // num_heads,
             dtype=self.dtype_policy,
             name="attention",
         )
@@ -65,7 +70,7 @@ def __init__(
 
     def build(self, input_shape):
         self.layer_norm_1.build(input_shape)
-        self.attention.build(input_shape)
+        self.attention.build(input_shape, input_shape, input_shape)
         self.layer_norm_2.build(input_shape)
         self.dense_1.build(input_shape)
         input_shape = self.dense_1.compute_output_shape(input_shape)
@@ -85,7 +90,7 @@ def _compute_attention(self, x, attention_mask=None, training=None):
             else None
         )
         mask = attention_mask
-        return self.attention(x, attention_mask=mask, training=training)
+        return self.attention(x, x, x, attention_mask=mask, training=training)
 
     def call(self, x, attention_mask=None, training=None):
         residual = x
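
With the switch to `MultiHeadAttention`, the encoder block now passes a per-head `key_dim` (`hidden_dim // num_heads`) instead of the full `hidden_dim`, builds the layer with separate query/value/key shapes, and calls it in self-attention form with query, value, and key all set to `x`. A rough usage sketch with made-up sizes; it uses the stock Keras layer so the snippet stands alone (`CLIPAttention` only changes the mask handling).

import numpy as np
from keras import layers

num_heads, hidden_dim, seq_len = 8, 512, 77  # illustrative sizes only
attention = layers.MultiHeadAttention(num_heads, hidden_dim // num_heads)

x = np.random.rand(2, seq_len, hidden_dim).astype("float32")
y = attention(x, x, x)  # self-attention: query, value, key are all `x`
# The output is projected back to the query's last dimension by default.
assert tuple(y.shape) == (2, seq_len, hidden_dim)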
