88"""
99
1010import math
11- import inspect
1211from dataclasses import dataclass
1312
1413import torch
@@ -167,99 +166,6 @@ def _init_weights(self, module):
         elif isinstance(module, nn.Embedding):
             torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
 
-    def forward(self, idx, targets=None):
-        device = idx.device
-        b, t = idx.size()
-        assert t <= self.config.block_size, f"Cannot forward sequence of length {t}, block size is only {self.config.block_size}"
-        pos = torch.arange(0, t, dtype=torch.long, device=device) # shape (t)
-
-        # forward the GPT model itself
-        tok_emb = self.transformer.wte(idx) # token embeddings of shape (b, t, n_embd)
-        pos_emb = self.transformer.wpe(pos) # position embeddings of shape (t, n_embd)
-        x = self.transformer.drop(tok_emb + pos_emb)
-        for block in self.transformer.h:
-            x = block(x)
-        x = self.transformer.ln_f(x)
-
-        if targets is not None:
-            # if we are given some desired targets also calculate the loss
-            logits = self.lm_head(x)
-            loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1), ignore_index=-1)
-        else:
-            # inference-time mini-optimization: only forward the lm_head on the very last position
-            logits = self.lm_head(x[:, [-1], :]) # note: using list [-1] to preserve the time dim
-            loss = None
-
-        return logits, loss
-
-    def crop_block_size(self, block_size):
-        # model surgery to decrease the block size if necessary
-        # e.g. we may load the GPT2 pretrained model checkpoint (block size 1024)
-        # but want to use a smaller block size for some smaller, simpler model
-        assert block_size <= self.config.block_size
-        self.config.block_size = block_size
-        self.transformer.wpe.weight = nn.Parameter(self.transformer.wpe.weight[:block_size])
-        for block in self.transformer.h:
-            if hasattr(block.attn, 'bias'):
-                block.attn.bias = block.attn.bias[:,:,:block_size,:block_size]
-
-    @classmethod
-    def from_pretrained(cls, model_type, override_args=None):
-        assert model_type in {'gpt2', 'gpt2-medium', 'gpt2-large', 'gpt2-xl'}
-        override_args = override_args or {} # default to empty dict
-        # only dropout can be overridden see more notes below
-        assert all(k == 'dropout' for k in override_args)
-        from transformers import GPT2LMHeadModel
-        print("loading weights from pretrained gpt: %s" % model_type)
-
-        # n_layer, n_head and n_embd are determined from model_type
-        config_args = {
-            'gpt2':         dict(n_layer=12, n_head=12, n_embd=768),  # 124M params
-            'gpt2-medium':  dict(n_layer=24, n_head=16, n_embd=1024), # 350M params
-            'gpt2-large':   dict(n_layer=36, n_head=20, n_embd=1280), # 774M params
-            'gpt2-xl':      dict(n_layer=48, n_head=25, n_embd=1600), # 1558M params
-        }[model_type]
-        print("forcing vocab_size=50257, block_size=1024, bias=True")
-        config_args['vocab_size'] = 50257 # always 50257 for GPT model checkpoints
-        config_args['block_size'] = 1024 # always 1024 for GPT model checkpoints
-        config_args['bias'] = True # always True for GPT model checkpoints
-        # we can override the dropout rate, if desired
-        if 'dropout' in override_args:
-            print(f"overriding dropout rate to {override_args['dropout']}")
-            config_args['dropout'] = override_args['dropout']
-        # create a from-scratch initialized minGPT model
-        config = GPTConfig(**config_args)
-        model = GPT(config)
-        sd = model.state_dict()
-        sd_keys = sd.keys()
-        sd_keys = [k for k in sd_keys if not k.endswith('.attn.bias')] # discard this mask / buffer, not a param
-
-        # init a huggingface/transformers model
-        model_hf = GPT2LMHeadModel.from_pretrained(model_type)
-        sd_hf = model_hf.state_dict()
-
-        # copy while ensuring all of the parameters are aligned and match in names and shapes
-        sd_keys_hf = sd_hf.keys()
-        sd_keys_hf = [k for k in sd_keys_hf if not k.endswith('.attn.masked_bias')] # ignore these, just a buffer
-        sd_keys_hf = [k for k in sd_keys_hf if not k.endswith('.attn.bias')] # same, just the mask (buffer)
-        transposed = ['attn.c_attn.weight', 'attn.c_proj.weight', 'mlp.c_fc.weight', 'mlp.c_proj.weight']
-        # basically the openai checkpoints use a "Conv1D" module, but we only want to use a vanilla Linear
-        # this means that we have to transpose these weights when we import them
-        assert len(sd_keys_hf) == len(sd_keys), f"mismatched keys: {len(sd_keys_hf)} != {len(sd_keys)}"
-        for k in sd_keys_hf:
-            if any(k.endswith(w) for w in transposed):
-                # special treatment for the Conv1D weights we need to transpose
-                assert sd_hf[k].shape[::-1] == sd[k].shape
-                with torch.no_grad():
-                    sd[k].copy_(sd_hf[k].t())
-            else:
-                # vanilla copy over the other parameters
-                assert sd_hf[k].shape == sd[k].shape
-                with torch.no_grad():
-                    sd[k].copy_(sd_hf[k])
-
-        return model
-
     def configure_optimizers(self, weight_decay, learning_rate, betas):
         # start with all of the candidate parameters
         param_dict = {pn: p for pn, p in self.named_parameters()}
@@ -278,12 +184,7 @@ def configure_optimizers(self, weight_decay, learning_rate, betas):
         print(f"num decayed parameter tensors: {len(decay_params)}, with {num_decay_params:,} parameters")
         print(f"num non-decayed parameter tensors: {len(nodecay_params)}, with {num_nodecay_params:,} parameters")
 
-        # Create AdamW optimizer and use the fused version if it is available
-        # fused_available = 'fused' in inspect.signature(torch.optim.AdamW).parameters
-        # use_fused = fused_available and device_type == 'cuda'
-        # extra_args = dict(fused=True) if use_fused else dict()
         optimizer = torch.optim.AdamW(optim_groups, lr=learning_rate, betas=betas, fused=False)
-        # print(f"using fused AdamW: {use_fused}")
 
         return optimizer
 
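For reference, a minimal sketch of the fused-AdamW selection that the removed commented-out lines describe, assuming the caller supplies a `device_type` string such as 'cuda' and that the local PyTorch build may or may not expose the `fused` keyword (the helper name `build_adamw` is hypothetical, not part of the file above):

import inspect
import torch

def build_adamw(optim_groups, learning_rate, betas, device_type='cuda'):
    # enable the fused AdamW kernel only when this PyTorch build exposes the
    # `fused` keyword and training actually runs on CUDA; otherwise fall back
    # to the default (non-fused) implementation
    fused_available = 'fused' in inspect.signature(torch.optim.AdamW).parameters
    use_fused = fused_available and device_type == 'cuda'
    extra_args = dict(fused=True) if use_fused else dict()
    return torch.optim.AdamW(optim_groups, lr=learning_rate, betas=betas, **extra_args)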