Skip to content

Commit 5ee1def

Browse files
authored
FEAT: add DarkBench dataset (#821)
1 parent e4e1512 commit 5ee1def

File tree

5 files changed

+44
-1
lines changed

5 files changed

+44
-1
lines changed

doc/api.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -114,6 +114,7 @@ API Reference
114114
fetch_adv_bench_dataset
115115
fetch_aya_redteaming_dataset
116116
fetch_babelscape_alert_dataset
117+
fetch_darkbench_dataset
117118
fetch_decoding_trust_stereotypes_dataset
118119
fetch_examples
119120
fetch_forbidden_questions_dataset

pyrit/datasets/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
from pyrit.datasets.adv_bench_dataset import fetch_adv_bench_dataset
55
from pyrit.datasets.aya_redteaming_dataset import fetch_aya_redteaming_dataset
66
from pyrit.datasets.babelscape_alert_dataset import fetch_babelscape_alert_dataset
7+
from pyrit.datasets.darkbench_dataset import fetch_darkbench_dataset
78
from pyrit.datasets.decoding_trust_stereotypes_dataset import fetch_decoding_trust_stereotypes_dataset
89
from pyrit.datasets.dataset_helper import fetch_examples
910
from pyrit.datasets.forbidden_questions_dataset import fetch_forbidden_questions_dataset
@@ -24,6 +25,7 @@
2425
"fetch_adv_bench_dataset",
2526
"fetch_aya_redteaming_dataset",
2627
"fetch_babelscape_alert_dataset",
28+
"fetch_darkbench_dataset",
2729
"fetch_decoding_trust_stereotypes_dataset",
2830
"fetch_examples",
2931
"fetch_forbidden_questions_dataset",

pyrit/datasets/aya_redteaming_dataset.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
# Copyright (c) Microsoft Corporation.
22
# Licensed under the MIT license.
33

4+
import ast
45
from pathlib import Path
56
from typing import List, Literal, Optional
67

@@ -77,7 +78,7 @@ def fetch_aya_redteaming_dataset(
7778
seed_prompts = []
7879

7980
for example in examples:
80-
categories = eval(example["harm_category"])
81+
categories = ast.literal_eval(example["harm_category"])
8182
if harm_categories is None or any(cat in categories for cat in harm_categories):
8283
if harm_scope is None or example["global_or_local"] == harm_scope:
8384
seed_prompts.append(
Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
# Copyright (c) Microsoft Corporation.
2+
# Licensed under the MIT license.
3+
4+
from datasets import load_dataset
5+
6+
from pyrit.models import SeedPromptDataset
7+
from pyrit.models.seed_prompt import SeedPrompt
8+
9+
10+
def fetch_darkbench_dataset() -> SeedPromptDataset:
    """
    Fetch DarkBench examples and create a SeedPromptDataset.

    DarkBench is a Hugging Face dataset of prompts related to "dark patterns";
    each row carries a 'Deceptive Pattern' label, which is used here as the
    prompt's harm category.

    Returns:
        SeedPromptDataset: A SeedPromptDataset containing the examples.
    """
    # Download (or load from the local HF cache) the "default" configuration
    # of the DarkBench dataset from the Hugging Face Hub.
    data = load_dataset("anonymous152311/darkbench", "default")

    # Build one SeedPrompt per row of the "train" split. The row's
    # 'Deceptive Pattern' label becomes the prompt's single harm category;
    # every prompt shares the same dataset-level description and source URL.
    seed_prompts = [
        SeedPrompt(
            value=item["Example"],
            data_type="text",
            name="",
            dataset_name="DarkBench",
            harm_categories=[item["Deceptive Pattern"]],
            description="""The DarkBench dataset focuses on dark patterns and is available on Hugging Face,
            created by anonymous152311 (https://huggingface.co/anonymous152311). The dataset includes
            660 examples, each labeled with a 'Deceptive Pattern' category. These categories indicate
            different types of deceptive strategies used in the data, such as:
            Anthropomorphization, Brand bias, Harmful generation, Sneaking, Sycophancy, or User retention.""",
            source="https://huggingface.co/datasets/anonymous152311/darkbench",
        )
        for item in data["train"]
    ]

    seed_prompt_dataset = SeedPromptDataset(prompts=seed_prompts)
    return seed_prompt_dataset

tests/integration/datasets/test_fetch_datasets.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
fetch_adv_bench_dataset,
88
fetch_aya_redteaming_dataset,
99
fetch_babelscape_alert_dataset,
10+
fetch_darkbench_dataset,
1011
fetch_decoding_trust_stereotypes_dataset,
1112
fetch_forbidden_questions_dataset,
1213
fetch_harmbench_dataset,
@@ -29,6 +30,7 @@
2930
(fetch_adv_bench_dataset, True),
3031
(fetch_aya_redteaming_dataset, True),
3132
(fetch_babelscape_alert_dataset, True),
33+
(fetch_darkbench_dataset, True),
3234
(fetch_decoding_trust_stereotypes_dataset, True),
3335
(fetch_forbidden_questions_dataset, True),
3436
(fetch_harmbench_dataset, True),

0 commit comments

Comments
 (0)