Commit

I've totally lost my place, so I'm just going to checkpoint and rebase later
mitchellgordon95 committed Nov 7, 2019
1 parent 0888263 commit 1fd8be2
Showing 42 changed files with 1,071 additions and 179 deletions.
3 changes: 2 additions & 1 deletion .gitignore
@@ -126,4 +126,5 @@ dmypy.json
 
 *.o*
 
-models/
+models/
+*.png
2 changes: 1 addition & 1 deletion checkpoint_utils/common.py
@@ -6,4 +6,4 @@ def prune(tensor, sparsity):
     tensor = np.abs(tensor)
     thresh_ind = int(tensor.size * sparsity)
     threshold = np.partition(tensor.flatten(), thresh_ind)[thresh_ind]
-    return tensor < threshold
+    return tensor > threshold
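The flipped comparison matters because `prune` builds a keep-mask, so the largest magnitudes must come out `True`. A minimal numpy-only sketch of the fixed behavior (the toy weights are hypothetical):

```python
import numpy as np

def prune(tensor, sparsity):
    """Keep-mask that is True for the largest-magnitude weights."""
    tensor = np.abs(tensor)
    thresh_ind = int(tensor.size * sparsity)
    # np.partition puts the thresh_ind-th smallest magnitude at that index;
    # only magnitudes strictly above it survive.
    threshold = np.partition(tensor.flatten(), thresh_ind)[thresh_ind]
    return tensor > threshold

weights = np.array([[0.05, -0.9], [0.4, -0.01]])
print(prune(weights, 0.5))
# [[False  True]
#  [False False]]
# Note the strict >: the weight sitting exactly at the threshold (0.4) is
# dropped too, so the achieved sparsity can land slightly above the target.
```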
28 changes: 24 additions & 4 deletions checkpoint_utils/compare_features.py
@@ -1,8 +1,10 @@
 import fire
 import json
 import numpy as np
+import sys
 
-def main(first_fname, second_fname):
+def layers(first_fname, second_fname):
+    """Returns an iterator over pairs of layer activations from BERT feature files"""
     first_file = open(first_fname)
     second_file = open(second_fname)
     for first_line, second_line in zip(first_file, second_file):
@@ -17,10 +19,28 @@ def main(first_fname, second_fname):
 
             first_vec = np.array(first_layer['values'])
             second_vec = np.array(second_layer['values'])
-            # dist = np.linalg.norm(first_vec - second_vec)
-            cos = 1 - np.dot(first_vec, second_vec) / (np.linalg.norm(first_vec) * np.linalg.norm(second_vec))
 
-            print(f"{first_feature['token']} ({first_layer['index']}): {cos}")
+            yield first_vec, second_vec
+
+def main(first_fname, second_fname):
+    total_sim = 0
+    layer_count = 0
+
+    for first_vec, second_vec in layers(first_fname, second_fname):
+        # dist = np.linalg.norm(first_vec - second_vec)
+        cos = np.dot(first_vec, second_vec) / (np.linalg.norm(first_vec) * np.linalg.norm(second_vec))
+
+        total_sim += cos
+        layer_count += 1
+        if layer_count % 10000 == 0:
+            print(layer_count, file=sys.stderr)
+        if layer_count > 300000:
+            print('Passed 300k layers, ignoring the rest.', file=sys.stderr)
+            break
+        # print(f"{first_feature['token']} ({first_layer['index']}): {cos}")
+
+    print(f'{total_sim} / {layer_count} = {total_sim / layer_count}')
 
 
 if __name__ == '__main__':
     fire.Fire(main)
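The per-pair computation is plain cosine similarity, accumulated over up to 300k (token, layer) activation pairs; the feature files are presumably the JSON-lines output of BERT's extract_features.py, one sentence per line. A tiny self-contained illustration (the vectors are made up):

```python
import numpy as np

def cosine(u, v):
    """Cosine similarity, as computed for each (token, layer) pair in main()."""
    return np.dot(u, v) / (np.linalg.norm(u) * np.linalg.norm(v))

u = np.array([1.0, 0.0, 1.0])
v = np.array([1.0, 1.0, 0.0])
print(cosine(u, v))  # 0.5: identical directions give 1.0, orthogonal ones 0.0
```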
51 changes: 51 additions & 0 deletions checkpoint_utils/memory_breakdown.py
@@ -0,0 +1,51 @@
import tensorflow as tf
import numpy as np
import re
import sys
import os
from shutil import copyfile

if len(sys.argv) != 2:
    print("Usage: memory_breakdown.py [pretrain-dir]")
    exit()

with tf.Session() as sess:

    # Load all the variables from the checkpoint
    total = 0
    embeddings = 0
    attention = 0
    FC = 0
    other = 0
    masks = 0
    cls = 0
    for var_name, _ in tf.train.list_variables(sys.argv[1]):
        tensor = tf.contrib.framework.load_variable(sys.argv[1], var_name)

        total += tensor.size

        if var_name.endswith('/mask'):
            masks += tensor.size
        elif var_name.startswith('cls'):
            cls += tensor.size
        elif 'embeddings/word_embeddings' in var_name:
            embeddings += tensor.size
        elif '/attention/' in var_name:
            attention += tensor.size
        elif '/intermediate' in var_name or '/output/' in var_name:
            FC += tensor.size
        else:
            other += tensor.size

    total -= masks
    print(f"""
    Embeds: {embeddings} ({int(embeddings/total * 100)}%)
    Attention: {attention} ({int(attention/total * 100)}%)
    FC: {FC} ({int(FC/total * 100)}%)
    cls: {cls} ({int(cls/total * 100)}%)
    other: {other} ({int(other/total * 100)}%)
    Total: {total}
    (masks: {masks})
    """)

24 changes: 14 additions & 10 deletions checkpoint_utils/prune_attn_heads.py
@@ -18,13 +18,12 @@ def params_for_attn(ledger, layer, masks=False):
         ledger[f'bert/encoder/layer_{layer}/attention/output/fully_connected/{end}'],
     )
 
 
 def extract_single_head(key, query, value, FC, head_ind):
     assert all([tensor.shape == (768, 768) for tensor in [key, query, value, FC]])
 
-    return tuple(tensor[:][SIZE_PER_HEAD*head_ind:SIZE_PER_HEAD*(head_ind+1)]
+    return tuple(tensor[:,SIZE_PER_HEAD*head_ind:SIZE_PER_HEAD*(head_ind+1)]
                  for tensor in [key, query, value]
-    ) + (FC[SIZE_PER_HEAD*head_ind:SIZE_PER_HEAD*(head_ind+1)][:],)
+    ) + (FC[SIZE_PER_HEAD*head_ind:SIZE_PER_HEAD*(head_ind+1),:],)
 
 
 def attn_head_weight(key, query, value, FC, head_ind):
@@ -39,9 +38,9 @@ def prune_single_head(ledger, layer, head):
     key, query, value, FC = params_for_attn(ledger, layer, masks=True)
 
     for tensor in [key, query, value]:
-        tensor[:][SIZE_PER_HEAD*head:SIZE_PER_HEAD*(head+1)] = 1
+        tensor[:,SIZE_PER_HEAD*head:SIZE_PER_HEAD*(head+1)] = 0
 
-    FC[SIZE_PER_HEAD*head:SIZE_PER_HEAD*(head+1)][:] = 1
+    FC[SIZE_PER_HEAD*head:SIZE_PER_HEAD*(head+1),:] = 0
 
 
 def prune_attn_heads(model_dir, sparsity: float):
@@ -58,20 +57,25 @@ def prune_attn_heads(model_dir, sparsity: float):
             ledger[var_name] = tf.contrib.framework.load_variable(model_dir, var_name)
 
         head_weights = np.zeros((12, 12))  # 12 layers, 12 heads each
+        # layer_stds = np.zeros(12)  # 12 layers
         for layer in range(12):
+            params = params_for_attn(ledger, layer)
+            # layer_stds[layer] = np.std(np.concatenate(params).flatten())
             for head in range(12):
                 print(layer, head)
                 params = params_for_attn(ledger, layer)
                 head_weights[layer,head] = attn_head_weight(*params, head)
 
+        # layer_stds /= np.linalg.norm(layer_stds, keepdims=True)
 
         # TODO: normalize by layer
-        layer_sums = np.sum(head_weights, axis=1, keepdims=True)
-        head_weights /= layer_sums
+        layer_norms = np.sum(head_weights, axis=1, keepdims=True)
+        # layer_norms = np.linalg.norm(head_weights, axis=1, keepdims=True)
+        head_weights /= layer_norms
+        # head_weights *= layer_stds
         mask = prune(head_weights, sparsity)
 
         # Non-zero gives us the indices of non-zero elements like
         # ([row indices], [column indices])
-        to_prune = np.nonzero(mask)
+        to_prune = np.nonzero(mask == 0)
        for layer, head in zip(to_prune[0], to_prune[1]):
            prune_single_head(ledger, layer, head)
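Why the indexing fix matters: each 768x768 projection in BERT-base packs 12 heads of 64 dimensions, and `tensor[:]` is just the whole array, so the old chained slice `tensor[:][a:b]` selected rows instead of a head's columns. A small numpy demonstration (array contents are arbitrary):

```python
import numpy as np

SIZE_PER_HEAD = 64  # 768 hidden dims / 12 heads
W = np.zeros((768, 768), dtype=np.float32)
head = 2

# Old, buggy indexing: W[:] is the same array, so this slices ROWS.
rows = W[:][SIZE_PER_HEAD*head:SIZE_PER_HEAD*(head+1)]
print(rows.shape)  # (64, 768) -- not one head's parameters

# Fixed indexing: a head's slice of the Q/K/V projections is 64 COLUMNS...
cols = W[:, SIZE_PER_HEAD*head:SIZE_PER_HEAD*(head+1)]
print(cols.shape)  # (768, 64)

# ...while the attention-output matrix consumes that head's 64 ROWS,
# which is why prune_single_head zeros FC[head*64:(head+1)*64, :].
W[SIZE_PER_HEAD*head:SIZE_PER_HEAD*(head+1), :] = 0
```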
38 changes: 38 additions & 0 deletions checkpoint_utils/random_masks.py
@@ -0,0 +1,38 @@
import tensorflow as tf
import numpy as np
import re
import sys
import os
from shutil import copyfile
import fire

def random_masks(model_dir, sparsity: float):
    """Prunes a random [sparsity] of the weights in each matrix of [model_dir].
    Makes a new checkpoint [model_dir]_random_prune_[sparsity].
    """
    model_dir = model_dir.rstrip('/')

    with tf.Session() as sess:

        # Load all the variables from the checkpoint
        for var_name, _ in tf.train.list_variables(model_dir):
            tensor = tf.contrib.framework.load_variable(model_dir, var_name)

            if var_name.endswith('/mask'):
                num_zeros = int(tensor.size * sparsity)
                new_mask = np.concatenate((np.zeros(num_zeros), np.ones(tensor.size - num_zeros)))
                np.random.shuffle(new_mask)
                new_mask = new_mask.reshape(tensor.shape).astype(tensor.dtype)
                var = tf.Variable(new_mask, name=var_name)
            else:
                var = tf.Variable(tensor, name=var_name)

        # Save these new variables
        saver = tf.train.Saver()
        sess.run(tf.global_variables_initializer())
        output_dir = model_dir + f"_random_prune_{int(sparsity*100)}"
        os.mkdir(output_dir)
        saver.save(sess, os.path.join(output_dir, 'random_prune.ckpt'))

if __name__ == '__main__':
    fire.Fire(random_masks)
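The mask construction itself is simple: build a vector with exactly the target number of zeros, shuffle it, and reshape to the weight's shape. A standalone numpy sketch of that step:

```python
import numpy as np

def random_mask(shape, sparsity, seed=0):
    """Keep-mask with an exact [sparsity] fraction of zeros, uniformly placed."""
    rng = np.random.default_rng(seed)
    size = int(np.prod(shape))
    num_zeros = int(size * sparsity)
    mask = np.concatenate((np.zeros(num_zeros), np.ones(size - num_zeros)))
    rng.shuffle(mask)
    return mask.reshape(shape)

m = random_mask((768, 768), 0.6)
print(1 - m.mean())  # ~0.6: fraction of weights zeroed out
```

Unlike the magnitude criterion in common.py, this ignores the weight values entirely, which makes it a natural random-pruning baseline for the experiments.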
31 changes: 0 additions & 31 deletions checkpoint_utils/weight_graph_change.py

This file was deleted.
