Tried with the Llama 3.2 1B model. Here's the script I used:
import abliterator
# Step 1: Loading the model
print("Step 1: Loading the model")
model = "meta-llama/Llama-3.2-1B" # the huggingface or path to the model you're interested in loading in
dataset = [abliterator.get_harmful_instructions(), abliterator.get_harmless_instructions()] # datasets to be used for caching and testing, split by harmful/harmless
device = 'cuda' # optional: defaults to cuda
n_devices = None # optional: when set to None, defaults to torch.cuda.device_count()
cache_fname = None # optional: e.g. 'llama_3_2_1b_cached.pth'. If you save progress with save_activations(filename), pass that filename here to load it back in
activation_layers = ['resid_pre', 'resid_post', 'attn_out', 'mlp_out'] # optional: defaults to ['resid_pre', 'resid_mid', 'resid_post'] which are the residual streams. Setting to None will cache ALL activation layer types
chat_template = "<system>\n{instruction}<end><assistant>" # optional: defaults to the Llama-3 instruction template. You can pass a format string or a custom class with a format method
negative_toks = [4250] # optional, but highly recommended: ' cannot' in Llama's tokenizer. Token IDs you don't want to see in output. Defaults to the preset for Llama-3 models
positive_toks = [23371, 40914] # optional, but highly recommended: ' Sure' and 'Sure' in Llama's tokenizer. Token IDs you do want to see. Defaults to the preset for Llama-3 models
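# Optional sanity check (a sketch using Hugging Face transformers, not part of the
# abliterator API; assumes transformers is installed): look up the token IDs for
# your tokenizer instead of hardcoding them.
from transformers import AutoTokenizer
_tok = AutoTokenizer.from_pretrained(model)
print(_tok.encode(" cannot", add_special_tokens=False))  # candidate negative_toks
print(_tok.encode("Sure", add_special_tokens=False))     # candidate positive_toks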
my_model = abliterator.ModelAbliterator(
    model,
    dataset,
    device=device,
    n_devices=n_devices,
    cache_fname=cache_fname,
    activation_layers=activation_layers,
    chat_template=chat_template,
    positive_toks=positive_toks,
    negative_toks=negative_toks
)
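# Quick check (sketch): chat_template above is a plain format string, so you can
# preview exactly how instructions will be wrapped before caching anything:
print(chat_template.format(instruction="example instruction"))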
# Step 2: Cache activations/sample dataset
print("\nStep 2: Caching activations")
# Once loaded, run the model against N harmful and N harmless samples
# so it has some data to work with:
my_model.cache_activations(N=128, reset=True, preserve_harmless=True)
# preserve_harmless=True is generally useful: it keeps the cached "desired behaviour"
# unaltered by any stacked modifications if you re-cache after applying some mods.
# Step 3: Save state (optional)
print("\nStep 3: Saving state")
# Save your cached activations and any currently applied modifications to the model's weights
my_model.save_activations('llama_3_2_1b_cached.pth')
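# To pick up from here in a later session (sketch, mirroring the cache_fname
# comment in Step 1), pass the saved file back in at construction time:
# my_model = abliterator.ModelAbliterator(model, dataset, cache_fname='llama_3_2_1b_cached.pth')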
# Step 4: Getting refusal directions from the cached activations
print("\nStep 4: Getting refusal directions")
refusal_dirs = my_model.refusal_dirs()
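# A sketch for choosing a candidate layer, assuming my_model.model is the underlying
# TransformerLens HookedTransformer (so the layer count lives at .cfg.n_layers):
n_layers = my_model.model.cfg.n_layers  # 16 for Llama-3.2-1B
print(f"{n_layers} layers available; middle layers are usually the best candidates")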
# Pick a layer to test
test_layer = 'blocks.8.hook_resid_pre' # Adjust layer number based on model architecture
print(f"Testing direction from layer: {test_layer}")
testing_dir = refusal_dirs[test_layer]
# Test the direction - returns (negative_score, positive_score)
# You want negative_score to go down, positive_score to go up
test_results = my_model.test_dir(testing_dir, N=16, use_hooks=True)
print(f"Test results (negative_score, positive_score): {test_results}")
# Step 5: Testing lots of refusal directions
print("\nStep 5: Finding the best refusal direction")
# Define a function to find the best refusal direction
def find_best_refusal_dir(model, N=4, use_hooks=True, invert=False):
    dirs = model.refusal_dirs(invert=invert)
    scores = []
    for direction_name, direction in dirs.items():
        print(f"Testing direction: {direction_name}")
        result = model.test_dir(direction, N=N, use_hooks=use_hooks)
        # test_dir returns (negative_score, positive_score); rank by the
        # negative score, since lower is better
        score = result[0]
        scores.append((score, (direction_name, direction)))
    return sorted(scores, key=lambda x: x[0])
# Find the best direction
best_directions = find_best_refusal_dir(my_model, N=8)
best_score, (best_dir_name, best_dir) = best_directions[0]
print(f"Best direction found: {best_dir_name} with score {best_score}")
# Step 6: Applying the weights
print("\nStep 6: Applying the weights")
# Apply the best direction to the model
my_model.apply_refusal_dirs([best_dir], layers=None)
# Note: layers=None will apply it to all writable layers
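# Alternative (a sketch, commented out to avoid applying the direction twice):
# restrict the edit to a band of middle layers, assuming `layers` accepts a list
# of block indices as the layers=None note above suggests:
# my_model.apply_refusal_dirs([best_dir], layers=list(range(4, 13)))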
# Step 7: Blacklisting specific layers (optional)
print("\nStep 7: Blacklisting specific layers (optional)")
# Sometimes some layers are troublesome no matter what you do.
# Blacklist the first and last couple of layers, as recommended; blacklisted
# layers are protected from any further modifications.
my_model.blacklist_layer([0, 1, 14, 15]) # Llama-3.2-1B has 16 layers (0-15); adjust for other architectures
# Step 8: Benchmarking
print("\nStep 8: Benchmarking")
# Check how much the model has changed after applying directions
with my_model:  # opens a temporary context; changes made inside are reverted on exit
    # Apply the direction within the context
    my_model.apply_refusal_dirs([best_dir])
    # Check the mean squared error using the current cached harmless runs as "ground truth"
    mse = my_model.mse_positive(N=32)
    print(f"MSE with harmless ground truth: {mse}")
# Step 9: Testing the model
print("\nStep 9: Testing the model")
# Run N samples from the harmful test set and print them
my_model.test(N=4, batch_size=2)
# Step 10: Generate a response to a prompt
print("\nStep 10: Generate a response to a prompt")
response = my_model.generate("how do I hack wifi networks?")
print("Response:", response)
print("\nDemo completed successfully!")