Skip to content

Commit 5ee1def

Browse files
authored
FEAT: add DarkBench dataset (#821)
1 parent e4e1512 commit 5ee1def

File tree

5 files changed

+44
-1
lines changed

5 files changed

+44
-1
lines changed

doc/api.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -114,6 +114,7 @@ API Reference
114114
fetch_adv_bench_dataset
115115
fetch_aya_redteaming_dataset
116116
fetch_babelscape_alert_dataset
117+
fetch_darkbench_dataset
117118
fetch_decoding_trust_stereotypes_dataset
118119
fetch_examples
119120
fetch_forbidden_questions_dataset

pyrit/datasets/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
from pyrit.datasets.adv_bench_dataset import fetch_adv_bench_dataset
55
from pyrit.datasets.aya_redteaming_dataset import fetch_aya_redteaming_dataset
66
from pyrit.datasets.babelscape_alert_dataset import fetch_babelscape_alert_dataset
7+
from pyrit.datasets.darkbench_dataset import fetch_darkbench_dataset
78
from pyrit.datasets.decoding_trust_stereotypes_dataset import fetch_decoding_trust_stereotypes_dataset
89
from pyrit.datasets.dataset_helper import fetch_examples
910
from pyrit.datasets.forbidden_questions_dataset import fetch_forbidden_questions_dataset
@@ -24,6 +25,7 @@
2425
"fetch_adv_bench_dataset",
2526
"fetch_aya_redteaming_dataset",
2627
"fetch_babelscape_alert_dataset",
28+
"fetch_darkbench_dataset",
2729
"fetch_decoding_trust_stereotypes_dataset",
2830
"fetch_examples",
2931
"fetch_forbidden_questions_dataset",

pyrit/datasets/aya_redteaming_dataset.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
# Copyright (c) Microsoft Corporation.
22
# Licensed under the MIT license.
33

4+
import ast
45
from pathlib import Path
56
from typing import List, Literal, Optional
67

@@ -77,7 +78,7 @@ def fetch_aya_redteaming_dataset(
7778
seed_prompts = []
7879

7980
for example in examples:
80-
categories = eval(example["harm_category"])
81+
categories = ast.literal_eval(example["harm_category"])
8182
if harm_categories is None or any(cat in categories for cat in harm_categories):
8283
if harm_scope is None or example["global_or_local"] == harm_scope:
8384
seed_prompts.append(
Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
# Copyright (c) Microsoft Corporation.
2+
# Licensed under the MIT license.
3+
4+
from datasets import load_dataset
5+
6+
from pyrit.models import SeedPromptDataset
7+
from pyrit.models.seed_prompt import SeedPrompt
8+
9+
10+
def fetch_darkbench_dataset() -> SeedPromptDataset:
    """
    Fetch DarkBench examples and create a SeedPromptDataset.

    DarkBench is a Hugging Face dataset of prompts related to "dark patterns";
    each row carries a 'Deceptive Pattern' label, which is used here as the
    prompt's harm category.

    Returns:
        SeedPromptDataset: A SeedPromptDataset containing the examples.
    """
    # Download (or load from the local HF cache) the "default" configuration
    # of the DarkBench dataset from the Hugging Face Hub.
    data = load_dataset("anonymous152311/darkbench", "default")

    # Build one SeedPrompt per row of the "train" split. The row's
    # 'Deceptive Pattern' label becomes the prompt's single harm category;
    # every prompt shares the same dataset-level description and source URL.
    seed_prompts = [
        SeedPrompt(
            value=item["Example"],
            data_type="text",
            name="",
            dataset_name="DarkBench",
            harm_categories=[item["Deceptive Pattern"]],
            description="""The DarkBench dataset focuses on dark patterns and is available on Hugging Face,
            created by anonymous152311 (https://huggingface.co/anonymous152311). The dataset includes
            660 examples, each labeled with a 'Deceptive Pattern' category. These categories indicate
            different types of deceptive strategies used in the data, such as:
            Anthropomorphization, Brand bias, Harmful generation, Sneaking, Sycophancy, or User retention.""",
            source="https://huggingface.co/datasets/anonymous152311/darkbench",
        )
        for item in data["train"]
    ]

    seed_prompt_dataset = SeedPromptDataset(prompts=seed_prompts)
    return seed_prompt_dataset

tests/integration/datasets/test_fetch_datasets.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
fetch_adv_bench_dataset,
88
fetch_aya_redteaming_dataset,
99
fetch_babelscape_alert_dataset,
10+
fetch_darkbench_dataset,
1011
fetch_decoding_trust_stereotypes_dataset,
1112
fetch_forbidden_questions_dataset,
1213
fetch_harmbench_dataset,
@@ -29,6 +30,7 @@
2930
(fetch_adv_bench_dataset, True),
3031
(fetch_aya_redteaming_dataset, True),
3132
(fetch_babelscape_alert_dataset, True),
33+
(fetch_darkbench_dataset, True),
3234
(fetch_decoding_trust_stereotypes_dataset, True),
3335
(fetch_forbidden_questions_dataset, True),
3436
(fetch_harmbench_dataset, True),

0 commit comments

Comments
 (0)