Improve Test Vector Generation (#8)
* [fix] Fix wrong bitwidth of weights

* [feature] Print warnings for problematic vectors

* [change] Reduce calculated bit width of AV

  - This assumes that the attention is distributed across several values rather than concentrated on a single token.

* [change] Remove outdated comment
Xeratec authored Nov 4, 2024
1 parent 598b424 commit 774c356
Showing 3 changed files with 71 additions and 17 deletions.
PyITA/ITA.py (53 additions, 15 deletions)
@@ -26,7 +26,8 @@
from .gelu import gelu_requantize, i_gelu_requantized, get_i_gelu_constants, get_i_gelu_requantized_constants
from .util import (generate_matrix_mem, pack_8b_to_word, pack_array_8b_to_word, pack_hex_24b, pack_multihead_8b_to_word,
pack_multihead_24b_to_word, random_shuffled_tensor, requantize, split_matrix, to_hex, write_matrix,
-                   write_matrix_mem, write_matrix_mem_hex, write_vector_mem_hex, get_almost_symmetric_scaling_factor)
+                   write_matrix_mem, write_matrix_mem_hex, write_vector_mem_hex, get_almost_symmetric_scaling_factor,
+                   error_MAEP)


class Transformer:
@@ -133,35 +134,35 @@ def _initialize_tensors(self, Q, V, Wq, Wk, Wv, Wo, Bq, Bk, Bv, Bo, FF_in, Wff,

         self.exp_sum = np.zeros(self.S, dtype = np.int32)

-        self.Q_in = random_shuffled_tensor((self.S, self.E), self.WI - 1) if Q is None else Q
+        self.Q_in = random_shuffled_tensor((self.S, self.E), self.WI) if Q is None else Q
         self.Q = np.pad(self.Q_in, ((0, self.S_ITA - self.S), (0, self.E_ITA - self.E)))

-        self.V_in = random_shuffled_tensor((self.S, self.E), self.WI - 1) if V is None else V
+        self.V_in = random_shuffled_tensor((self.S, self.E), self.WI) if V is None else V
         self.V = np.pad(self.V_in, ((0, self.S_ITA - self.S), (0, self.E_ITA - self.E)))

         # WIESEP: K is the same as V because we do cross-attention
         self.K_in = self.V_in
         self.K = self.V

-        self.FF_in = random_shuffled_tensor((self.S, self.E), self.WI - 1) if FF_in is None else FF_in
+        self.FF_in = random_shuffled_tensor((self.S, self.E), self.WI) if FF_in is None else FF_in
         self.FF = np.pad(self.FF_in, ((0, self.S_ITA - self.S), (0, self.E_ITA - self.E)))

         #### Weight matrices ####
-        self.Wq_in = random_shuffled_tensor((self.H, self.E, self.P), self.WI - 1) if Wq is None else Wq
+        self.Wq_in = random_shuffled_tensor((self.H, self.E, self.P), self.WI) if Wq is None else Wq
         self.Wq = np.pad(self.Wq_in, ((0, 0), (0, self.E_ITA - self.E), (0, self.P_ITA - self.P)))

-        self.Wk_in = random_shuffled_tensor((self.H, self.E, self.P), self.WI - 1) if Wk is None else Wk
+        self.Wk_in = random_shuffled_tensor((self.H, self.E, self.P), self.WI) if Wk is None else Wk
         self.Wk = np.pad(self.Wk_in, ((0, 0), (0, self.E_ITA - self.E), (0, self.P_ITA - self.P)))

-        self.Wv_in = random_shuffled_tensor((self.H, self.E, self.P), self.WI - 1) if Wv is None else Wv
+        self.Wv_in = random_shuffled_tensor((self.H, self.E, self.P), self.WI) if Wv is None else Wv
         self.Wv = np.pad(self.Wv_in, ((0, 0), (0, self.E_ITA - self.E), (0, self.P_ITA - self.P)))

-        self.Wo_in = random_shuffled_tensor((self.H, self.P, self.E), self.WI - 1) if Wo is None else Wo
+        self.Wo_in = random_shuffled_tensor((self.H, self.P, self.E), self.WI) if Wo is None else Wo
         self.Wo = np.pad(self.Wo_in, ((0, 0), (0, self.P_ITA - self.P), (0, self.E_ITA - self.E)))

-        self.Wff_in = random_shuffled_tensor((1, self.E, self.F), self.WI - 1) if Wff is None else Wff
+        self.Wff_in = random_shuffled_tensor((1, self.E, self.F), self.WI) if Wff is None else Wff
         self.Wff = np.pad(self.Wff_in, ((0, 0), (0, self.E_ITA - self.E), (0, self.F_ITA - self.F)))
-        self.Wff2_in = random_shuffled_tensor((1, self.F, self.E), self.WI - 1) if Wff2 is None else Wff2
+        self.Wff2_in = random_shuffled_tensor((1, self.F, self.E), self.WI) if Wff2 is None else Wff2
         self.Wff2 = np.pad(self.Wff2_in, ((0, 0), (0, self.F_ITA - self.F), (0, self.E_ITA - self.E)))

         #### Bias matrices ####
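Context on the bitwidth fix above: with `self.WI - 1` the generated inputs and weights only spanned half of the signed `WI`-bit range, so the fix restores the full 8-bit dynamic range of the test vectors. A minimal sketch of the difference, assuming `random_shuffled_tensor` draws signed values of the requested bit width (the stand-in helper below is a simplification, not the repository's implementation):

import numpy as np

# Assumed stand-in for random_shuffled_tensor: signed values spanning the
# requested bit width (the real helper shuffles a balanced value range).
def random_tensor(shape, bits):
    return np.random.randint(-2**(bits - 1), 2**(bits - 1), size=shape, dtype=np.int8)

WI = 8
narrow = random_tensor((4, 4), WI - 1)  # old behaviour: values in [-64, 63]
full = random_tensor((4, 4), WI)        # fixed: full range [-128, 127]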
@@ -258,7 +259,7 @@ def _initialize_quantization_parameters(self):
             elif i == 3: # QK
                 max_bit_width = np.log2(self.requant_eps_mult[i, :].astype(np.uint32) * self.P * 2**8).astype(np.uint32)
             elif i == 4: # AV
-                max_bit_width = np.log2(self.requant_eps_mult[i, :].astype(np.uint32) * self.S * 2**8).astype(np.uint32)
+                max_bit_width = np.log2(self.requant_eps_mult[i, :].astype(np.uint32) * self.S * 2**5).astype(np.uint32)
             elif i == 5: # OW
                 max_bit_width = np.log2(self.requant_eps_mult[i, :].astype(np.uint32) * self.E * 2**9).astype(np.uint32)
             elif i == 6: # Sum OW
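The `2**8` to `2**5` change in the AV branch encodes the assumption from the commit message: after integerization each softmax row carries a fixed total mass, so if attention is spread across tokens rather than concentrated on one, each accumulated A·V term stays well below the single-token worst case. A rough standalone sketch of the headroom difference (the multiplier value and `S` below are made up; in the diff this runs inside `_initialize_quantization_parameters`):

import numpy as np

# Hypothetical standalone check of the requantization headroom for the AV
# stage, mirroring the max_bit_width expressions in the diff.
eps_mult = np.array([120], dtype=np.uint32)  # example requantization multiplier
S = 64                                       # example sequence length

# Worst case: all softmax mass (~2**8 after integerization) on one token,
# multiplied by an 8-bit value, so each accumulated term nears S * 2**8.
worst_case = np.log2(eps_mult * S * 2**8).astype(np.uint32)

# Assumed typical case: attention spread over the S tokens, so each of the
# S products carries only a fraction of the softmax mass (~2**5 bound here).
spread_case = np.log2(eps_mult * S * 2**5).astype(np.uint32)

print(worst_case, spread_case)  # spread_case is 3 bits smaller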
Expand Down Expand Up @@ -996,17 +997,53 @@ def generateTestVectors(path, **kwargs):
     acc1.export_hwpe()
     acc1.export_numpy()

-    def print_tensor_stats(tensor):
+    def calculate_tensor_stats(tensor, name, tol = 1e-1):
+        # Calculate the similarly of elements within one row and over all columns
+        similarity_row = np.mean(np.abs(np.diff(tensor, axis = -2)))
+        similarity_column = np.mean(np.abs(np.diff(tensor, axis = -1)))
+
+        if (similarity_row < tol) or (similarity_column < tol):
+            if name is not None:
+                print(f"WARNING: {name} is constant!")
+                print(f"{name} Mean-Squared Difference (row)   : {similarity_row:5.1f}")
+                print(f"{name} Mean-Squared Difference (column): {similarity_column:5.1f}")
+                raise ValueError(f"Tensor {name} is constant! This is a bad test vector!")
+            else:
+                print("  WARNING: Tensor is constant!")
+                print(f"  Mean-Squared Difference (row)   : {similarity_row:5.1f}")
+                print(f"  Mean-Squared Difference (column): {similarity_column:5.1f}")
+
+        return similarity_row, similarity_column
+
+    def print_tensor_stats(tensor, name = None):
         print(f"  Min: {np.min(tensor)}")
         print(f"  Max: {np.max(tensor)}")

-        # Calculate the simmilarty of elements witin one row and over all comumns
-        similarity_row = np.mean(np.abs(np.diff(tensor, axis = -2)))
-        similarity_column = np.mean(np.abs(np.diff(tensor, axis = -1)))
+        similarity_row, similarity_column = calculate_tensor_stats(tensor, name)

         print(f"  Mean-Squared Difference (row)   : {similarity_row:5.1f}")
         print(f"  Mean-Squared Difference (column): {similarity_column:5.1f}")

+    # Calculate all tensor statistics
+    tensors = {
+        "Qp": acc1.Qp_requant,
+        "Kp": acc1.Kp_requant,
+        "Vp": acc1.Vp_requant,
+        "A": acc1.A_requant,
+        "A_soft": acc1.A_partial_softmax,
+        "O_soft": acc1.O_soft_requant,
+        "Out_soft": acc1.Out_soft_requant,
+        "Out_soft_sum": acc1.Out_soft_sum_requant
+    }
+
+    for name, tensor in tensors.items():
+        calculate_tensor_stats(tensor, name)
+
+    # Check if softmax is sufficiently precise
+    maep_softmax = error_MAEP(acc1.A_partial_softmax, acc1.A_real_softmax)
+    if maep_softmax > 5:
+        print(f"WARNING: Softmax is not precise enough! MAEP Error to Integer Softmax: {maep_softmax:.2f}%")
+
     if kwargs['verbose'] > 1:
         print("=> Qp")
         print_tensor_stats(acc1.Qp_requant)
@@ -1038,6 +1075,7 @@ def print_tensor_stats(tensor):

print("=> A (partial softmax)")
print_tensor_stats(acc1.A_partial_softmax)
print(f" MAEP Error to Integer Softmax: {maep_softmax:.2f}%")
if kwargs['verbose'] > 3:
print(acc1.A_partial_softmax)

Expand Down
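The new `calculate_tensor_stats` guards against degenerate test vectors: if the mean absolute difference between neighbouring elements along rows or columns falls below `tol`, the tensor is near-constant, which usually signals saturation somewhere in the pipeline. A self-contained sketch of just that check (the tensors below are made up; the real function also prints statistics and raises for named tensors):

import numpy as np

# Sketch of the degenerate-vector check: tol and the np.diff-based metric
# mirror calculate_tensor_stats above.
def is_constant(tensor, tol=1e-1):
    similarity_row = np.mean(np.abs(np.diff(tensor, axis=-2)))
    similarity_column = np.mean(np.abs(np.diff(tensor, axis=-1)))
    return (similarity_row < tol) or (similarity_column < tol)

good = np.random.randint(-128, 128, size=(8, 8))   # varied test vector
bad = np.full((8, 8), 42)                          # saturated, constant output

print(is_constant(good))  # False: neighbouring elements differ on average
print(is_constant(bad))   # True: the diff would raise for a named tensor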
PyITA/softmax.py (0 additions, 2 deletions)
@@ -157,8 +157,6 @@ def streamingPartialSoftmax(x, integerize = True):

     ## STAGE 2: Calculate the softmax activation
     # Invert the partial sum
-    # WIESEP: Scale Softmax to 127
-    # The Softmax values are maximum 127 as sumdotp modules can only do signed-signed operations for now. This is a temporary fix until sumdotp is fixed.
     if integerize:
         exp_partial_sum_inverse = np.floor((2**8 - 1) * 2**8 / exp_partial_sum).astype(np.int32)
     else:
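With the outdated comment gone, what remains is the fixed-point inversion of the softmax row sum: the inverse is scaled by (2**8 - 1) * 2**8, so a reconstructed activation can reach 255 rather than the 127 the removed comment described. A small worked sketch (the final multiply-and-shift is an assumption about how the inverse is consumed downstream, not code from the repository):

import numpy as np

# Fixed-point inversion as in streamingPartialSoftmax with integerize=True;
# the input row and the reconstruction step below are illustrative only.
exp_partial = np.array([128, 64, 32, 32], dtype=np.int32)  # made-up row
exp_partial_sum = np.sum(exp_partial)                      # 256

# Inverse of the row sum in Q8 fixed point, scaled so a row that puts all
# of its mass on one token maps to at most 2**8 - 1 = 255.
inverse = np.floor((2**8 - 1) * 2**8 / exp_partial_sum).astype(np.int32)

softmax_row = (exp_partial * inverse) >> 8   # ~[127, 63, 31, 31]
print(inverse, softmax_row, softmax_row.sum())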
PyITA/util.py (18 additions, 0 deletions)
@@ -517,3 +517,21 @@ def almost_symmetric_dequantize(quantized_activations: np.ndarray, clip_lo: f32,
     S, _ = get_almost_symmetric_scaling_factor(clip_lo, n_bits)
     activations = quantized_activations * S
     return activations
+
+
+def error_MAEP(a: np.ndarray, b: np.ndarray):
+    """
+    Compute the mean absolute error percentage (MAEP) between two tensors.
+    A value of 0 indicates that the two tensors are equal.
+    A value of 100 indicates that the second tensor is on average twice as large as the first tensor.
+
+    Parameters:
+        a (np.ndarray): The first tensor.
+        b (np.ndarray): The second tensor.
+
+    Returns:
+        np.ndarray: The mean absolute error percentage between the two tensors.
+    """
+    return 100 * np.mean(np.abs(a - b)) / max(
+        np.abs(np.max(a)) + np.abs(np.min(a)),
+        np.abs(np.max(b)) + np.abs(np.min(b)))
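A quick made-up example of the metric: `error_MAEP` normalizes the mean absolute error by the larger of the two tensors' |max| + |min| spans, which is what makes the 5% softmax threshold in `generateTestVectors` independent of scale. The arrays below are illustrative, not taken from the test vectors:

import numpy as np

# Compare an integer softmax approximation against a scaled reference row
# using the same formula as error_MAEP above.
a = np.array([127.0, 63.0, 31.0, 31.0])   # e.g. partial integer softmax
b = np.array([128.0, 64.0, 32.0, 32.0])   # e.g. ideal softmax scaled to 256

maep = 100 * np.mean(np.abs(a - b)) / max(
    np.abs(np.max(a)) + np.abs(np.min(a)),
    np.abs(np.max(b)) + np.abs(np.min(b)))

print(f"{maep:.2f}%")  # 0.62%: well under the 5% warning threshold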
