 
 sys.path.insert(1, '../')
 from resource_loader import ResourceLoader
-from utils.plotting_utils import make_histogram
+from utils.plotting_utils import make_activations_histogram, make_logits_histogram
 
 # hyperparameters
 # data and model
@@ -102,6 +102,15 @@ def __init__(self, config):
         self.html_out = os.path.join(os.path.dirname(os.path.abspath('.')), 'out', config.dataset, str(config.sae_ckpt_dir))
         self.seed = config.seed
 
+        # create subdirectories to store logit histograms and feature activation histograms
+        os.makedirs(os.path.join(self.html_out, 'logits_histograms'), exist_ok=True)
+        os.makedirs(os.path.join(self.html_out, 'activations_histograms'), exist_ok=True)
+
+        # TODO: why are my logits of the order of 10 while Anthropic's are < 1? Do they rescale them,
+        # or is it because of the linear approximation to LayerNorm?
+
+        # self.attributed_logits is a tensor of shape (n_features, vocab_size) containing the logits attributed to each feature
+        self.attributed_logits = self.compute_logits()
         self.top_logits, self.bottom_logits = self.compute_top_and_bottom_logits()
 
         print(f"Will process features in {self.num_phases} phases. Each phase will run its forward pass in {self.num_batches} batches")
@@ -124,6 +133,12 @@ def build(self):
             context_window_data = self.compute_context_window_data(feature_start_idx, feature_end_idx)
             top_acts_data = self.compute_top_activations(context_window_data)
             for h in range(0, feature_end_idx - feature_start_idx):
+                # make and save a histogram of logits for this feature
+                feature_id = phase * self.num_features_per_phase + h
+                make_logits_histogram(logits=self.attributed_logits[feature_id, :],
+                                      feature_id=feature_id,
+                                      dirpath=self.html_out)
+                # write the page for this feature
                 self.write_feature_page(phase, h, context_window_data, top_acts_data)
 
             # if phase == 1:
@@ -184,31 +199,39 @@ def compute_top_activations(self, data):
         return top_activations_data
 
     @torch.no_grad()
-    def compute_top_and_bottom_logits(self,):
+    def compute_logits(self,):
         """
-        Computes top and bottom logits for each feature.
-        Returns (top_logits, bottom_logits). Each is of type `torch.return_types.topk`.
-        It uses the full LayerNorm instead of its approximation. # TODO: How important is that?
-        # also, this function is specific to SAEs trained on the activations of last MLP layer for now.
+        Computes logits for each feature through the path-expansion approach.
+        Returns a torch tensor of shape (num_features, vocab_size).
+        By default, it uses the full LayerNorm instead of its linear approximation. # TODO: understand whether that's okay
+        # also, for now this function is specific to SAEs trained on the activations of the last MLP layer. TODO: generalize this
+        The logits for each feature are shifted so that their median value is 0.
         """
         mlp_out = self.transformer.transformer.h[-1].mlp.c_proj(self.autoencoder.decoder.weight.detach().t())  # (L, C)
         ln_out = self.transformer.transformer.ln_f(mlp_out)  # (L, C)
         logits = self.transformer.lm_head(ln_out)  # (L, V)
-        shifted_logits = (logits - logits.median(dim=1, keepdim=True).values)  # (L, V)
-
+        attributed_logits = (logits - logits.median(dim=1, keepdim=True).values)  # (L, V)
+        return attributed_logits
+
+    @torch.no_grad()
+    def compute_top_and_bottom_logits(self,):
+        """
+        Computes top and bottom logits for each feature.
+        Returns (top_logits, bottom_logits). Each is of type `torch.return_types.topk`.
+        """
         # GPT-2 tokenizer has vocab size 50257. nanoGPT sets vocab size = 50304 for higher training speed.
         # See https://twitter.com/karpathy/status/1621578354024677377?lang=en
         # The decoder will give an error if a token with id > 50256 is given, and bottom_logits may pick one of these tokens.
         # Therefore, set the max token id to 50256 by hand.
-        shifted_logits = shifted_logits[:, :50257]
-
-        top_logits = torch.topk(shifted_logits, largest=True, sorted=True, k=self.num_top_activations, dim=1)  # (L, k)
-        bottom_logits = torch.topk(shifted_logits, largest=False, sorted=True, k=self.num_top_activations, dim=1)  # (L, k)
+        attributed_logits = self.attributed_logits[:, :50257]
+        top_logits = torch.topk(attributed_logits, largest=True, sorted=True, k=self.num_top_activations, dim=1)  # (L, k)
+        bottom_logits = torch.topk(attributed_logits, largest=False, sorted=True, k=self.num_top_activations, dim=1)  # (L, k)
         return top_logits, bottom_logits
 
     def write_feature_page(self, phase, h, data, top_acts_data):
         """Writes feature pages for dead / alive neurons; also makes a histogram.
         For alive features, it calls sample_and_write."""
+
         curr_feature_acts_MW = data["feature_acts"][:, :, h]
         mid_token_feature_acts_M = curr_feature_acts_MW[:, self.window_radius]
         num_nonzero_acts = torch.count_nonzero(mid_token_feature_acts_M)
@@ -220,11 +243,12 @@ def write_feature_page(self, phase, h, data, top_acts_data):
 
         act_density = torch.count_nonzero(curr_feature_acts_MW) / (self.total_sampled_tokens * self.window_length) * 100
         non_zero_acts = curr_feature_acts_MW[curr_feature_acts_MW != 0]
-        make_histogram(activations=non_zero_acts,
+        make_activations_histogram(activations=non_zero_acts,
                        density=act_density,
                        feature_id=feature_id,
                        dirpath=self.html_out)
 
+
         if num_nonzero_acts < self.num_intervals * self.samples_per_interval:
             write_ultralow_density_feature_page(feature_id=feature_id,
                                                 decode=self.decode,
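
For readers who want to see the shapes involved in the new path-expansion attribution (compute_logits followed by compute_top_and_bottom_logits), the standalone sketch below reproduces the same chain of operations with randomly initialized stand-in modules. All of the names and sizes here (decoder, c_proj, ln_f, lm_head, n_features, n_mlp, n_embd) are hypothetical and are not taken from this repository's config.

import torch
import torch.nn as nn

# Hypothetical sizes, for illustration only: number of SAE features, MLP hidden
# width, embedding width, nanoGPT's padded vocab size, and k for topk.
n_features, n_mlp, n_embd, vocab_size, k = 16, 3072, 768, 50304, 10

# Randomly initialized stand-ins for the trained modules used in the diff:
# the SAE decoder, the last block's mlp.c_proj, the final LayerNorm, and lm_head.
decoder = nn.Linear(n_features, n_mlp, bias=False)
c_proj = nn.Linear(n_mlp, n_embd)
ln_f = nn.LayerNorm(n_embd)
lm_head = nn.Linear(n_embd, vocab_size, bias=False)

with torch.no_grad():
    feature_dirs = decoder.weight.t()                            # (L, n_mlp): one decoder direction per feature
    logits = lm_head(ln_f(c_proj(feature_dirs)))                 # (L, V): push each direction through the rest of the model
    logits = logits - logits.median(dim=1, keepdim=True).values  # center each feature's logits at its median
    logits = logits[:, :50257]                                   # drop nanoGPT's padding token ids
    top_logits = torch.topk(logits, k=k, largest=True, sorted=True, dim=1)
    bottom_logits = torch.topk(logits, k=k, largest=False, sorted=True, dim=1)
    print(top_logits.values.shape, bottom_logits.indices.shape)  # both (L, k)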
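The make_logits_histogram helper itself lives in utils/plotting_utils.py and is not shown in this diff, so its body below is only a guess. Judging from the call site (logits, feature_id, dirpath) and the logits_histograms subdirectory created in __init__, a minimal matplotlib version might look roughly like this; the real helper may differ in binning, styling, and file naming.

import os
import matplotlib
matplotlib.use('Agg')  # render to files without a display
import matplotlib.pyplot as plt

def make_logits_histogram(logits, feature_id, dirpath):
    # Hypothetical sketch, not the repo's actual implementation:
    # save a histogram of one feature's attributed logits to
    # <dirpath>/logits_histograms/<feature_id>.png.
    fig, ax = plt.subplots(figsize=(4, 3))
    ax.hist(logits.detach().cpu().numpy(), bins=60)
    ax.set_xlabel('attributed logit')
    ax.set_ylabel('token count')
    fig.tight_layout()
    fig.savefig(os.path.join(dirpath, 'logits_histograms', f'{feature_id}.png'))
    plt.close(fig)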