
after abliteration still getting same responses #30

@someshfengde

Description

Tried this with the Llama 3.2 1B model.

Here's the script I used:

import abliterator

# Step 1: Loading the model
print("Step 1: Loading the model")
model = "meta-llama/Llama-3.2-1B"  # the Hugging Face model ID or local path of the model to load
dataset = [abliterator.get_harmful_instructions(), abliterator.get_harmless_instructions()] # datasets to be used for caching and testing, split by harmful/harmless
device = 'cuda'                             # optional: defaults to cuda
n_devices = None                            # optional: when set to None, defaults to `torch.cuda.device_count()`
cache_fname = None  # 'llama_3_2_1b_cached.pth'   # optional: if you need to save where you left off, use `save_activations(filename)` to write out a file; this is how you load it back in
activation_layers = ['resid_pre', 'resid_post', 'attn_out', 'mlp_out']  # optional: defaults to ['resid_pre', 'resid_mid', 'resid_post'] which are the residual streams. Setting to None will cache ALL activation layer types
chat_template = "<system>\n{instruction}<end><assistant>"  # optional: defaults to Llama-3 instruction template. You can use a format string or a custom class with format function
negative_toks = [4250]                      # optional, but highly recommended: ' cannot' in Llama's tokenizer. Tokens you don't want to be seeing. Defaults to my preset for Llama-3 models
positive_toks = [23371, 40914]              # optional, but highly recommended: ' Sure' and 'Sure' in Llama's tokenizer. Tokens you want to be seeing, basically. Defaults to my preset for Llama-3 models
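# Optional sanity check (a sketch, not required): these ids are presets for the Llama-3
# tokenizer, so it is worth confirming they actually decode to ' cannot' / ' Sure' / 'Sure'
# for the tokenizer of the model being loaded here, since ids differ between tokenizers.
from transformers import AutoTokenizer
_tok = AutoTokenizer.from_pretrained(model)
print([_tok.decode([t]) for t in negative_toks + positive_toks])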

my_model = abliterator.ModelAbliterator(
    model,
    dataset,
    device=device,
    n_devices=n_devices,
    cache_fname=cache_fname,
    activation_layers=activation_layers,
    chat_template=chat_template,
    positive_toks=positive_toks,
    negative_toks=negative_toks
)

# Step 2: Cache activations/sample dataset
print("\nStep 2: Caching activations")
# Once loaded in, run the model against N samples of harmful, and N samples of harmless
# so it has some data to work with:
my_model.cache_activations(N=128, reset=True, preserve_harmless=True)
# preserve_harmless=True is generally useful: it keeps the harmless ("desired behaviour")
# cache unaltered by any stacked modifications if you re-run caching after applying mods.

# Step 3: Save state (optional)
print("\nStep 3: Saving state")
# Save your cached activations and any currently applied modifications to the model's weights
my_model.save_activations('llama_3_2_1b_cached.pth')

# Step 4: Getting refusal directions from the cached activations
print("\nStep 4: Getting refusal directions")
refusal_dirs = my_model.refusal_dirs()
# Pick a layer to test
test_layer = 'blocks.8.hook_resid_pre'  # Adjust layer number based on model architecture
print(f"Testing direction from layer: {test_layer}")
testing_dir = refusal_dirs[test_layer]
# Test the direction - returns (negative_score, positive_score)
# You want negative_score to go down, positive_score to go up
test_results = my_model.test_dir(testing_dir, N=16, use_hooks=True)
print(f"Test results (negative_score, positive_score): {test_results}")

# Step 5: Testing lots of refusal directions
print("\nStep 5: Finding the best refusal direction")
# Define a function to find the best refusal direction
def find_best_refusal_dir(model, N=4, use_hooks=True, invert=False):
    dirs = model.refusal_dirs(invert=invert)
    scores = []
    for direction_name, direction in dirs.items():
        print(f"Testing direction: {direction_name}")
        result = model.test_dir(direction, N=N, use_hooks=use_hooks)
        # test_dir returns (negative_score, positive_score); take the negative score
        score = result[0]
        scores.append((score, (direction_name, direction)))
    return sorted(scores, key=lambda x: x[0])

# Find the best direction
best_directions = find_best_refusal_dir(my_model, N=8)
best_score, (best_dir_name, best_dir) = best_directions[0]
print(f"Best direction found: {best_dir_name} with score {best_score}")

# Step 6: Applying the weights
print("\nStep 6: Applying the weights")
# Apply the best direction to the model
my_model.apply_refusal_dirs([best_dir], layers=None)
# Note: layers=None will apply it to all writable layers

# Step 7: Blacklisting specific layers (optional)
print("\nStep 7: Blacklisting specific layers (optional)")
# Sometimes some layers are troublesome no matter what you do, so blacklist the first
# and last few layers. Note that blacklisting only affects modifications applied afterwards,
# so do this before apply_refusal_dirs if you want it to take effect.
my_model.blacklist_layer([0, 1, 14, 15])  # Llama-3.2-1B has 16 blocks; adjust for other architectures

# Step 8: Benchmarking
print("\nStep 8: Benchmarking")
# Check how much the model has changed after applying directions
with my_model:  # opens a temporary context; modifications applied inside are reverted on exit
    # Apply the direction in the context
    my_model.apply_refusal_dirs([best_dir])
    # Check the Mean Squared Error using the current cached harmless runs as "ground truth"
    mse = my_model.mse_positive(N=32)
    print(f"MSE with harmless ground truth: {mse}")

# Step 9: Testing the model
print("\nStep 9: Testing the model")
# Run N samples from the harmful test set and print them
my_model.test(N=4, batch_size=2)

# Step 10: Generate a response to a prompt
print("\nStep 10: Generate a response to a prompt")
response = my_model.generate("how do I hack wifi networks?")
print("Response:", response)

print("\nDemo completed successfully!")

