Commit 910bbfe
Support 3D attention mask (#5887)
Support 3D attention mask with shape (batch_size, sequence_length, all_sequence_length)
1 parent cc6e8fb commit 910bbfe
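
For context, the existing 2D raw mask of shape (batch_size, all_sequence_length) applies one key mask to every query position; a 3D mask gives each query position its own row over the keys. A minimal illustration with hypothetical values (not taken from the commit), for batch_size = 1, sequence_length = 2 and all_sequence_length = 3:

// Hypothetical raw 3D attention mask of shape (1, 2, 3):
// entry [b][s][k] is 1 if query position s in batch b may attend to key k, 0 otherwise.
const int32_t mask_3d[1][2][3] = {
    {{1, 1, 0},    // query 0 may attend to keys 0 and 1
     {1, 1, 1}}};  // query 1 may attend to all three keys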

File tree: 11 files changed, +886 −596 lines changed
onnxruntime/contrib_ops/cpu/bert/attention.cc

Lines changed: 9 additions & 2 deletions
@@ -59,7 +59,10 @@ Status AttentionBase::CheckInputs(const TensorShape& input_shape,
   //   input       : (batch_size, sequence_length, hidden_size)
   //   weights     : (hidden_size, 3 * hidden_size)
   //   bias        : (3 * hidden_size)
-  //   mask_index  : nullptr, (batch_size), (2 * batch_size), (batch_size, 1), (1, 1) or (batch_size, past_sequence_length + sequence_length)
+  //   mask_index  : nullptr, (batch_size), (2 * batch_size),
+  //                 or (batch_size, 1), (1, 1)
+  //                 or (batch_size, past_sequence_length + sequence_length)
+  //                 or (batch_size, sequence_length, past_sequence_length + sequence_length)
   //   past        : (2, batch_size, num_heads, past_sequence_length, head_size)
 
   const auto& dims = input_shape.GetDims();
@@ -136,8 +139,12 @@ Status AttentionBase::CheckInputs(const TensorShape& input_shape,
           return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Inputs 'mask_index' with raw attention mask shall have shape batch_size x (past_sequence_length + sequence_length)");
         }
       }
+    } else if (mask_dims.size() == 3) {
+      if (static_cast<int>(mask_dims[0]) != batch_size || mask_dims[1] != sequence_length || static_cast<int>(mask_dims[2]) != past_sequence_length + sequence_length) {
+        return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Inputs 'mask_index' of 3d shall have shape batch_size x sequence_length x (past_sequence_length + sequence_length)");
+      }
     } else {
-      return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Input 'mask_index' is expected to have 1 or 2 dimensions, got ",
+      return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Input 'mask_index' is expected to have 1, 2 or 3 dimensions, got ",
                              mask_dims.size());
     }
   }
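
As a standalone sketch of what the new branch checks (IsValid3DMask is a hypothetical helper written only for illustration; the real logic is inline in AttentionBase::CheckInputs):

#include <cstdint>
#include <vector>

// A 3D mask must match (batch_size, sequence_length, past_sequence_length + sequence_length).
bool IsValid3DMask(const std::vector<int64_t>& mask_dims,
                   int batch_size, int sequence_length, int past_sequence_length) {
  return mask_dims.size() == 3 &&
         static_cast<int>(mask_dims[0]) == batch_size &&
         static_cast<int>(mask_dims[1]) == sequence_length &&
         static_cast<int>(mask_dims[2]) == past_sequence_length + sequence_length;
}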

onnxruntime/contrib_ops/cpu/bert/attention_base.h

Lines changed: 2 additions & 2 deletions
(Whitespace-only change: the trailing member comments are realigned.)

@@ -26,8 +26,8 @@ class AttentionBase {
                      int sequence_length,
                      int& past_sequence_length) const;
 
-  int num_heads_;           // number of attention heads
-  bool is_unidirectional_;  // whether every token can only attend to previous tokens.
+  int num_heads_;               // number of attention heads
+  bool is_unidirectional_;      // whether every token can only attend to previous tokens.
 };
 
 }  // namespace contrib

onnxruntime/contrib_ops/cpu/bert/attention_cpu_base.h

Lines changed: 1 addition & 1 deletion
@@ -89,7 +89,7 @@ class AttentionCPUBase : public AttentionBase {
                   const T* K,                                   // k data. Its size is BxNxSxH
                   const int32_t* mask_index,                    // mask index. nullptr if no mask or its size is B
                   const std::vector<int64_t>* mask_index_dims,  // mask index shape
-                  T* mask_data,                                 // buffer for mask data. Its size is: SxS* if is_unidirectional_; BxSxS* if mask_index; null otherwise
+                  T* mask_data,                                 // buffer for mask data. It is nullptr if mask_index is nullptr, otherwise its shape is BxSxS*
                   int batch_size,                               // batch size of self-attention
                   int sequence_length,                          // sequence length of self-attention
                   int past_sequence_length,                     // sequence length of past state
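
In the shape shorthand used by these comments, B = batch_size, S = sequence_length and S* = past_sequence_length + sequence_length (the all_sequence_length from the commit message). A hypothetical sizing helper, written here only to make the BxSxS* comment concrete:

#include <cstddef>

// Number of elements in the mask_data scratch buffer: B x S x S*.
size_t MaskDataSize(int batch_size, int sequence_length, int past_sequence_length) {
  const int all_sequence_length = past_sequence_length + sequence_length;
  return static_cast<size_t>(batch_size) * sequence_length * all_sequence_length;
}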

onnxruntime/contrib_ops/cpu/bert/attention_helper.h

Lines changed: 20 additions & 2 deletions
@@ -72,12 +72,31 @@ void PrepareMask(const int32_t* mask_index,
   // mask_data has been filled with 0, and its shape is BxSxS*
   T* p_mask = mask_data;
 
+  // For 3D mask, convert values 0 to -10000.0, and 1 to 0.0, then apply unidirectional mask if any.
+  if (nullptr != mask_index_dims && mask_index_dims->size() == 3) {
+    for (int i = 0; i < batch_size * sequence_length * all_sequence_length; i++) {
+      p_mask[i] = (mask_index[i] > 0) ? static_cast<T>(0.0f) : static_cast<T>(-10000.0f);
+    }
+
+    if (is_unidirectional) {
+      for (int b_i = 0; b_i < batch_size; b_i++) {
+        for (int s_i = 0; s_i < sequence_length - 1; s_i++) {
+          for (int m_i = past_sequence_length + s_i + 1; m_i < all_sequence_length; m_i++) {
+            p_mask[s_i * all_sequence_length + m_i] += static_cast<T>(-10000.0f);
+          }
+        }
+        p_mask += sequence_length * all_sequence_length;
+      }
+    }
+
+    return;
+  }
+
   bool is_raw_attention_mask = (nullptr != mask_index_dims && mask_index_dims->size() == 2);
   bool has_mask_start_position = (nullptr != mask_index_dims && mask_index_dims->size() == 1 && static_cast<int>(mask_index_dims->at(0)) == 2 * batch_size);
 
   for (int b_i = 0; b_i < batch_size; b_i++) {
     // TODO: mask_index can be used in softmax to save some calculation.
-
     if (nullptr != mask_index) {
       if (is_raw_attention_mask) {
         // Raw attention mask has value 0 or 1. Here we convert 0 to -10000.0, and 1 to 0.0.
@@ -120,7 +139,6 @@ void PrepareMask(const int32_t* mask_index,
 
     p_mask += sequence_length * all_sequence_length;
   }
-
 }
 
 // Concatenate a past state chunk S'xH with input state chunk SxH into present state chunk S*xH
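
To see the new control flow in isolation, here is a self-contained sketch of just the 3D path, specialized to float (Prepare3DMask is a hypothetical name; in the commit this logic is the templated branch added to PrepareMask above):

#include <cstdint>

// Convert raw mask values: 0 -> -10000 (blocked), 1 -> 0 (visible). For
// unidirectional attention, additionally block every key beyond position
// past_sequence_length + s_i for the query at position s_i.
void Prepare3DMask(const int32_t* mask_index, float* mask_data,
                   int batch_size, int sequence_length,
                   int past_sequence_length, int all_sequence_length,
                   bool is_unidirectional) {
  const int total = batch_size * sequence_length * all_sequence_length;
  for (int i = 0; i < total; i++) {
    mask_data[i] = (mask_index[i] > 0) ? 0.0f : -10000.0f;
  }

  if (is_unidirectional) {
    float* p_mask = mask_data;
    for (int b_i = 0; b_i < batch_size; b_i++) {
      // The last query row (s_i == sequence_length - 1) already sees all keys,
      // so the loop stops one row early, matching the original code.
      for (int s_i = 0; s_i < sequence_length - 1; s_i++) {
        for (int m_i = past_sequence_length + s_i + 1; m_i < all_sequence_length; m_i++) {
          p_mask[s_i * all_sequence_length + m_i] += -10000.0f;
        }
      }
      p_mask += sequence_length * all_sequence_length;
    }
  }
}

Note that the unidirectional pass adds -10000 instead of overwriting: positions the raw mask already blocked stay at a large negative value either way, so both cases vanish after softmax.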
