Commit

I've totally lost my place, so I'm just going to checkpoint and rebase later
mitchellgordon95 committed Nov 7, 2019
1 parent 0888263 commit 1fd8be2
Showing 42 changed files with 1,071 additions and 179 deletions.
3 changes: 2 additions & 1 deletion .gitignore
@@ -126,4 +126,5 @@ dmypy.json
 
 *.o*
 
-models/
+models/
+*.png
2 changes: 1 addition & 1 deletion checkpoint_utils/common.py
@@ -6,4 +6,4 @@ def prune(tensor, sparsity):
     tensor = np.abs(tensor)
     thresh_ind = int(tensor.size * sparsity)
     threshold = np.partition(tensor.flatten(), thresh_ind)[thresh_ind]
-    return tensor < threshold
+    return tensor > threshold
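The flipped comparison matters because `prune` builds a keep-mask, so the largest magnitudes must come out `True`. A minimal numpy-only sketch of the fixed behavior (the toy weights are hypothetical):

```python
import numpy as np

def prune(tensor, sparsity):
    """Keep-mask that is True for the largest-magnitude weights."""
    tensor = np.abs(tensor)
    thresh_ind = int(tensor.size * sparsity)
    # np.partition puts the thresh_ind-th smallest magnitude at that index;
    # only magnitudes strictly above it survive.
    threshold = np.partition(tensor.flatten(), thresh_ind)[thresh_ind]
    return tensor > threshold

weights = np.array([[0.05, -0.9], [0.4, -0.01]])
print(prune(weights, 0.5))
# [[False  True]
#  [False False]]
# Note the strict >: the weight sitting exactly at the threshold (0.4) is
# dropped too, so the achieved sparsity can land slightly above the target.
```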
28 changes: 24 additions & 4 deletions checkpoint_utils/compare_features.py
@@ -1,8 +1,10 @@
 import fire
 import json
 import numpy as np
+import sys
 
-def main(first_fname, second_fname):
+def layers(first_fname, second_fname):
+    """Returns an iterator over pairs of layer activations from BERT feature files"""
     first_file = open(first_fname)
     second_file = open(second_fname)
     for first_line, second_line in zip(first_file, second_file):
@@ -17,10 +19,28 @@ def main(first_fname, second_fname):
 
             first_vec = np.array(first_layer['values'])
             second_vec = np.array(second_layer['values'])
-            # dist = np.linalg.norm(first_vec - second_vec)
-            cos = 1 - np.dot(first_vec, second_vec) / (np.linalg.norm(first_vec) * np.linalg.norm(second_vec))
 
-            print(f"{first_feature['token']} ({first_layer['index']}): {cos}")
+            yield first_vec, second_vec
+
+def main(first_fname, second_fname):
+    total_sim = 0
+    layer_count = 0
+
+    for first_vec, second_vec in layers(first_fname, second_fname):
+        # dist = np.linalg.norm(first_vec - second_vec)
+        cos = np.dot(first_vec, second_vec) / (np.linalg.norm(first_vec) * np.linalg.norm(second_vec))
+
+        total_sim += cos
+        layer_count += 1
+        if layer_count % 10000 == 0:
+            print(layer_count, file=sys.stderr)
+        if layer_count > 300000:
+            print('Passed 300k layers, ignoring the rest.', file=sys.stderr)
+            break
+        # print(f"{first_feature['token']} ({first_layer['index']}): {cos}")
+
+    print(f'{total_sim} / {layer_count} = {total_sim / layer_count}')
 
 
 if __name__ == '__main__':
     fire.Fire(main)
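The per-pair computation is plain cosine similarity, accumulated over up to 300k (token, layer) activation pairs; the feature files are presumably the JSON-lines output of BERT's extract_features.py, one sentence per line. A tiny self-contained illustration (the vectors are made up):

```python
import numpy as np

def cosine(u, v):
    """Cosine similarity, as computed for each (token, layer) pair in main()."""
    return np.dot(u, v) / (np.linalg.norm(u) * np.linalg.norm(v))

u = np.array([1.0, 0.0, 1.0])
v = np.array([1.0, 1.0, 0.0])
print(cosine(u, v))  # 0.5: identical directions give 1.0, orthogonal ones 0.0
```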
51 changes: 51 additions & 0 deletions checkpoint_utils/memory_breakdown.py
@@ -0,0 +1,51 @@
import tensorflow as tf
import numpy as np
import re
import sys
import os
from shutil import copyfile

if len(sys.argv) != 2:
    print("Usage: memory_breakdown.py [pretrain-dir]")
    exit()

with tf.Session() as sess:

    # Load all the variables from the checkpoint
    total = 0
    embeddings = 0
    attention = 0
    FC = 0
    other = 0
    masks = 0
    cls = 0
    for var_name, _ in tf.train.list_variables(sys.argv[1]):
        tensor = tf.contrib.framework.load_variable(sys.argv[1], var_name)

        total += tensor.size

        if var_name.endswith('/mask'):
            masks += tensor.size
        elif var_name.startswith('cls'):
            cls += tensor.size
        elif 'embeddings/word_embeddings' in var_name:
            embeddings += tensor.size
        elif '/attention/' in var_name:
            attention += tensor.size
        elif '/intermediate' in var_name or '/output/' in var_name:
            FC += tensor.size
        else:
            other += tensor.size

    total -= masks
    print(f"""
    Embeds: {embeddings} ({int(embeddings/total * 100)}%)
    Attention: {attention} ({int(attention/total * 100)}%)
    FC: {FC} ({int(FC/total * 100)}%)
    cls: {cls} ({int(cls/total * 100)}%)
    other: {other} ({int(other/total * 100)}%)
    Total: {total}
    (masks: {masks})
    """)

24 changes: 14 additions & 10 deletions checkpoint_utils/prune_attn_heads.py
@@ -18,13 +18,12 @@ def params_for_attn(ledger, layer, masks=False):
         ledger[f'bert/encoder/layer_{layer}/attention/output/fully_connected/{end}'],
     )
 
 
 def extract_single_head(key, query, value, FC, head_ind):
     assert all([tensor.shape == (768, 768) for tensor in [key, query, value, FC]])
 
-    return tuple(tensor[:][SIZE_PER_HEAD*head_ind:SIZE_PER_HEAD*(head_ind+1)]
+    return tuple(tensor[:,SIZE_PER_HEAD*head_ind:SIZE_PER_HEAD*(head_ind+1)]
                  for tensor in [key, query, value]
-    ) + (FC[SIZE_PER_HEAD*head_ind:SIZE_PER_HEAD*(head_ind+1)][:],)
+    ) + (FC[SIZE_PER_HEAD*head_ind:SIZE_PER_HEAD*(head_ind+1),:],)
 
 
 def attn_head_weight(key, query, value, FC, head_ind):
@@ -39,9 +38,9 @@ def prune_single_head(ledger, layer, head):
     key, query, value, FC = params_for_attn(ledger, layer, masks=True)
 
     for tensor in [key, query, value]:
-        tensor[:][SIZE_PER_HEAD*head:SIZE_PER_HEAD*(head+1)] = 1
+        tensor[:,SIZE_PER_HEAD*head:SIZE_PER_HEAD*(head+1)] = 0
 
-    FC[SIZE_PER_HEAD*head:SIZE_PER_HEAD*(head+1)][:] = 1
+    FC[SIZE_PER_HEAD*head:SIZE_PER_HEAD*(head+1),:] = 0
 
 
 def prune_attn_heads(model_dir, sparsity: float):
@@ -58,20 +57,25 @@ def prune_attn_heads(model_dir, sparsity: float):
             ledger[var_name] = tf.contrib.framework.load_variable(model_dir, var_name)
 
         head_weights = np.zeros((12, 12))  # 12 layers, 12 heads each
+        # layer_stds = np.zeros(12)  # 12 layers
         for layer in range(12):
+            params = params_for_attn(ledger, layer)
+            # layer_stds[layer] = np.std(np.concatenate(params).flatten())
             for head in range(12):
                 print(layer, head)
                 params = params_for_attn(ledger, layer)
                 head_weights[layer,head] = attn_head_weight(*params, head)
 
+        # layer_stds /= np.linalg.norm(layer_stds, keepdims=True)
 
         # TODO: normalize by layer
-        layer_sums = np.sum(head_weights, axis=1, keepdims=True)
-        head_weights /= layer_sums
+        layer_norms = np.sum(head_weights, axis=1, keepdims=True)
+        # layer_norms = np.linalg.norm(head_weights, axis=1, keepdims=True)
+        head_weights /= layer_norms
+        # head_weights *= layer_stds
         mask = prune(head_weights, sparsity)
 
         # Non-zero gives us the indices of non-zero elements like
         # ([row indices], [column indices])
-        to_prune = np.nonzero(mask)
+        to_prune = np.nonzero(mask == 0)
        for layer, head in zip(to_prune[0], to_prune[1]):
            prune_single_head(ledger, layer, head)
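Why the indexing fix matters: each 768x768 projection in BERT-base packs 12 heads of 64 dimensions, and `tensor[:]` is just the whole array, so the old chained slice `tensor[:][a:b]` selected rows instead of a head's columns. A small numpy demonstration (array contents are arbitrary):

```python
import numpy as np

SIZE_PER_HEAD = 64  # 768 hidden dims / 12 heads
W = np.zeros((768, 768), dtype=np.float32)
head = 2

# Old, buggy indexing: W[:] is the same array, so this slices ROWS.
rows = W[:][SIZE_PER_HEAD*head:SIZE_PER_HEAD*(head+1)]
print(rows.shape)  # (64, 768) -- not one head's parameters

# Fixed indexing: a head's slice of the Q/K/V projections is 64 COLUMNS...
cols = W[:, SIZE_PER_HEAD*head:SIZE_PER_HEAD*(head+1)]
print(cols.shape)  # (768, 64)

# ...while the attention-output matrix consumes that head's 64 ROWS,
# which is why prune_single_head zeros FC[head*64:(head+1)*64, :].
W[SIZE_PER_HEAD*head:SIZE_PER_HEAD*(head+1), :] = 0
```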
38 changes: 38 additions & 0 deletions checkpoint_utils/random_masks.py
@@ -0,0 +1,38 @@
import tensorflow as tf
import numpy as np
import re
import sys
import os
from shutil import copyfile
import fire

def random_masks(model_dir, sparsity: float):
    """Prunes a random [sparsity] of the weights in each matrix of [model_dir].
    Makes a new checkpoint [model_dir]_random_prune_[sparsity].
    """
    model_dir = model_dir.rstrip('/')

    with tf.Session() as sess:

        # Load all the variables from the checkpoint
        for var_name, _ in tf.train.list_variables(model_dir):
            tensor = tf.contrib.framework.load_variable(model_dir, var_name)

            if var_name.endswith('/mask'):
                num_zeros = int(tensor.size * sparsity)
                new_mask = np.concatenate((np.zeros(num_zeros), np.ones(tensor.size - num_zeros)))
                np.random.shuffle(new_mask)
                new_mask = new_mask.reshape(tensor.shape).astype(tensor.dtype)
                var = tf.Variable(new_mask, name=var_name)
            else:
                var = tf.Variable(tensor, name=var_name)

        # Save these new variables
        saver = tf.train.Saver()
        sess.run(tf.global_variables_initializer())
        output_dir = model_dir + f"_random_prune_{int(sparsity*100)}"
        os.mkdir(output_dir)
        saver.save(sess, os.path.join(output_dir, 'random_prune.ckpt'))

if __name__ == '__main__':
    fire.Fire(random_masks)
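The mask construction itself is simple: build a vector with exactly the target number of zeros, shuffle it, and reshape to the weight's shape. A standalone numpy sketch of that step:

```python
import numpy as np

def random_mask(shape, sparsity, seed=0):
    """Keep-mask with an exact [sparsity] fraction of zeros, uniformly placed."""
    rng = np.random.default_rng(seed)
    size = int(np.prod(shape))
    num_zeros = int(size * sparsity)
    mask = np.concatenate((np.zeros(num_zeros), np.ones(size - num_zeros)))
    rng.shuffle(mask)
    return mask.reshape(shape)

m = random_mask((768, 768), 0.6)
print(1 - m.mean())  # ~0.6: fraction of weights zeroed out
```

Unlike the magnitude criterion in common.py, this ignores the weight values entirely, which makes it a natural random-pruning baseline for the experiments.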
31 changes: 0 additions & 31 deletions checkpoint_utils/weight_graph_change.py

This file was deleted.
