
after abliteration still getting same responses #30

@someshfengde

Description

Tried this with the Llama 3.2 1B model.

Here's the script I used:

import abliterator

# Step 1: Loading the model
print("Step 1: Loading the model")
model = "meta-llama/Llama-3.2-1B"  # the Hugging Face model ID or local path of the model to load
dataset = [abliterator.get_harmful_instructions(), abliterator.get_harmless_instructions()] # datasets to be used for caching and testing, split by harmful/harmless
device = 'cuda'                             # optional: defaults to cuda
n_devices = None                            # optional: when set to None, defaults to `torch.cuda.device_count()`
cache_fname = None  # 'llama_3_2_1b_cached.pth'   # optional: if you need to save where you left off, use `save_activations(filename)` to write out a file; this is how you load it back in
activation_layers = ['resid_pre', 'resid_post', 'attn_out', 'mlp_out']  # optional: defaults to ['resid_pre', 'resid_mid', 'resid_post'] which are the residual streams. Setting to None will cache ALL activation layer types
chat_template = "<system>\n{instruction}<end><assistant>"  # optional: defaults to Llama-3 instruction template. You can use a format string or a custom class with format function
negative_toks = [4250]                      # optional, but highly recommended: ' cannot' in Llama's tokenizer. Tokens you don't want to be seeing. Defaults to my preset for Llama-3 models
positive_toks = [23371, 40914]              # optional, but highly recommended: ' Sure' and 'Sure' in Llama's tokenizer. Tokens you want to be seeing, basically. Defaults to my preset for Llama-3 models
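# Optional sanity check (a sketch, not required): these ids are presets for the Llama-3
# tokenizer, so it is worth confirming they actually decode to ' cannot' / ' Sure' / 'Sure'
# for the tokenizer of the model being loaded here, since ids differ between tokenizers.
from transformers import AutoTokenizer
_tok = AutoTokenizer.from_pretrained(model)
print([_tok.decode([t]) for t in negative_toks + positive_toks])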

my_model = abliterator.ModelAbliterator(
    model,
    dataset,
    device=device,
    n_devices=n_devices,
    cache_fname=cache_fname,
    activation_layers=activation_layers,
    chat_template=chat_template,
    positive_toks=positive_toks,
    negative_toks=negative_toks
)

# Step 2: Cache activations/sample dataset
print("\nStep 2: Caching activations")
# Once loaded in, run the model against N samples of harmful, and N samples of harmless
# so it has some data to work with:
my_model.cache_activations(N=128, reset=True, preserve_harmless=True)
# preserve_harmless=True is generally useful: it keeps the harmless ("desired behaviour")
# cache unaltered by any stacked modifications if you re-run caching after applying mods.

# Step 3: Save state (optional)
print("\nStep 3: Saving state")
# Save your cached activations and any currently applied modifications to the model's weights
my_model.save_activations('llama_3_2_1b_cached.pth')

# Step 4: Getting refusal directions from the cached activations
print("\nStep 4: Getting refusal directions")
refusal_dirs = my_model.refusal_dirs()
# Pick a layer to test
test_layer = 'blocks.8.hook_resid_pre'  # Adjust layer number based on model architecture
print(f"Testing direction from layer: {test_layer}")
testing_dir = refusal_dirs[test_layer]
# Test the direction - returns (negative_score, positive_score)
# You want negative_score to go down, positive_score to go up
test_results = my_model.test_dir(testing_dir, N=16, use_hooks=True)
print(f"Test results (negative_score, positive_score): {test_results}")

# Step 5: Testing lots of refusal directions
print("\nStep 5: Finding the best refusal direction")
# Define a function to find the best refusal direction
def find_best_refusal_dir(model, N=4, use_hooks=True, invert=False):
    dirs = model.refusal_dirs(invert=invert)
    scores = []
    for direction_name, direction in dirs.items():
        print(f"Testing direction: {direction_name}")
        result = model.test_dir(direction, N=N, use_hooks=use_hooks)
        # test_dir returns (negative_score, positive_score); take the negative score
        score = result[0]
        scores.append((score, (direction_name, direction)))
    return sorted(scores, key=lambda x: x[0])

# Find the best direction
best_directions = find_best_refusal_dir(my_model, N=8)
best_score, (best_dir_name, best_dir) = best_directions[0]
print(f"Best direction found: {best_dir_name} with score {best_score}")

# Step 6: Applying the weights
print("\nStep 6: Applying the weights")
# Apply the best direction to the model
my_model.apply_refusal_dirs([best_dir], layers=None)
# Note: layers=None will apply it to all writable layers

# Step 7: Blacklisting specific layers (optional)
print("\nStep 7: Blacklisting specific layers (optional)")
# Sometimes some layers are troublesome no matter what you do, so blacklist the first
# and last few layers. Note that blacklisting only affects modifications applied afterwards,
# so do this before apply_refusal_dirs if you want it to take effect.
my_model.blacklist_layer([0, 1, 14, 15])  # Llama-3.2-1B has 16 blocks; adjust for other architectures

# Step 8: Benchmarking
print("\nStep 8: Benchmarking")
# Check how much the model has changed after applying directions
with my_model:  # opens a temporary context; modifications applied inside are reverted on exit
    # Apply the direction in the context
    my_model.apply_refusal_dirs([best_dir])
    # Check the Mean Squared Error using the current cached harmless runs as "ground truth"
    mse = my_model.mse_positive(N=32)
    print(f"MSE with harmless ground truth: {mse}")

# Step 9: Testing the model
print("\nStep 9: Testing the model")
# Run N samples from the harmful test set and print them
my_model.test(N=4, batch_size=2)

# Step 10: Generate a response to a prompt
print("\nStep 10: Generate a response to a prompt")
response = my_model.generate("how do I hack wifi networks?")
print("Response:", response)

print("\nDemo completed successfully!")

