Merge pull request #271 from aiverify-foundation/ms-292
[MS-292] Removal of generating dynamic benchmark prompts on the fly
imda-kelvinkok authored Jul 29, 2024
2 parents 08e6f0c + c79852f commit 67953fe
Showing 16 changed files with 77 additions and 324 deletions.
52 changes: 51 additions & 1 deletion docs/getting_started/quick_install.md
@@ -60,7 +60,57 @@ It is recommended to create a new Python virtual environment in your working dir
```
$ python -m venv venv
$ source venv/bin/activate
```

### Setting up Logging
By default, the Moonshot library uses a logger that writes log entries, tagged with their severity, to `moonshot.log`.

The logger currently uses the following severity levels:

| Logging Severity | Description |
| ---------------- | ------------------------------------------------------------ |
| debug | Used for detailed debugging information, helpful during development. |
| info | General information about system operation, useful for system monitoring. |
| warning | Indicates a potential issue that should be looked into but is not immediately critical. |
| error | Reports a failure within the system, requiring immediate attention. |
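
These names appear to map directly onto Python's standard `logging` levels (the format string shown later in this section is a stdlib `logging` format, which supports this reading); a minimal sketch of that assumed mapping, not Moonshot's own code:

```
import logging

# Assumed mapping from Moonshot's severity names to stdlib logging levels.
# This mirrors the table above; it is an illustration only.
SEVERITY_LEVELS = {
    "debug": logging.DEBUG,      # detailed diagnostic output
    "info": logging.INFO,        # routine operational messages
    "warning": logging.WARNING,  # potential issues worth investigating
    "error": logging.ERROR,      # failures needing immediate attention
}

print(SEVERITY_LEVELS["info"])  # 20
```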

Additionally, you can customize the logging behavior through environment variables:

| Environment Variable | Description | Default Value |
| -------------------- | ----------- | ------------- |
| MS_LOG_NAME | The name of the log file. | `moonshot.log` |
| MS_LOG_LEVEL | The minimum logging severity to capture. Can be one of `debug`, `info`, `warning`, or `error`. | `info` |
| MS_LOG_TO_FILE | Whether to write logs to a file (`true`) or to standard output (`false`). | `false` |

To customize Moonshot's logging behavior, export the environment variables in your terminal; this overrides the default logging configuration:
```
export MS_LOG_NAME=moonshot
export MS_LOG_LEVEL=debug
export MS_LOG_TO_FILE=true
```
After exporting these variables, any subsequent runs of the Moonshot application will adhere to these logging settings
until the terminal session ends or the variables are unset.
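
You can also scope the settings to a single run by prefixing the command itself; a sketch, assuming Moonshot's usual `python -m moonshot` entry point:

```
$ MS_LOG_LEVEL=debug MS_LOG_TO_FILE=true python -m moonshot cli interactive
```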

The logging format is designed to provide a clear and concise overview of each log entry, structured as follows:
```
%(asctime)s [%(levelname)s][%(filename)s::%(funcName)s(%(lineno)d)] %(message)s
```
This format includes:

- timestamp (`%(asctime)s`)
- severity level (`%(levelname)s`)
- filename (`%(filename)s`)
- function name (`%(funcName)s`)
- line number (`%(lineno)d`)
- log message (`%(message)s`)

For example:
```
2023-04-01 12:00:00 [INFO][module.py::main(10)] Application started successfully.
```
This format keeps logs easy to read while providing enough detail for debugging and monitoring.
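
As an illustration of the documented behavior (not Moonshot's actual implementation), a logger honoring these environment variables and this format could be assembled with Python's standard `logging` module roughly as follows:

```
import logging
import os
import sys

# Read the documented environment variables, falling back to the defaults above.
log_name = os.getenv("MS_LOG_NAME", "moonshot.log")
log_level = os.getenv("MS_LOG_LEVEL", "info").upper()
log_to_file = os.getenv("MS_LOG_TO_FILE", "false").lower() == "true"

# The format string documented above.
formatter = logging.Formatter(
    "%(asctime)s [%(levelname)s][%(filename)s::%(funcName)s(%(lineno)d)] %(message)s"
)

# MS_LOG_TO_FILE=true writes to the named file; otherwise log to standard output.
handler = logging.FileHandler(log_name) if log_to_file else logging.StreamHandler(sys.stdout)
handler.setFormatter(formatter)

logger = logging.getLogger("moonshot-demo")  # hypothetical logger name
logger.setLevel(getattr(logging, log_level, logging.INFO))
logger.addHandler(handler)

logger.info("Application started successfully.")
```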

### Specifying Custom Environment File
If you have a custom '<b>.env</b>' file, specify the path to the file as follows:
24 changes: 4 additions & 20 deletions moonshot/integrations/cli/benchmark/recipe.py
@@ -37,8 +37,7 @@ def add_recipe(args) -> None:
Args:
args (argparse.Namespace): The arguments provided to the command line interface.
Expected keys are name, description, tags, categories, dataset, prompt_templates, metrics, attack_modules,
and grading_scale.
Expected keys are name, description, tags, categories, dataset, prompt_templates, metrics and grading_scale.
Returns:
None
@@ -54,9 +53,6 @@ def add_recipe(args) -> None:
literal_eval(args.prompt_templates) if args.prompt_templates else []
)
metrics = literal_eval(args.metrics)
attack_modules = (
literal_eval(args.attack_modules) if args.attack_modules else []
)
grading_scale = literal_eval(args.grading_scale) if args.grading_scale else {}

new_recipe_id = api_create_recipe(
@@ -67,7 +63,6 @@ def add_recipe(args) -> None:
datasets,
prompt_templates,
metrics,
attack_modules,
grading_scale,
)
print(f"[add_recipe]: Recipe ({new_recipe_id}) created.")
@@ -340,7 +335,6 @@ def display_recipes(recipes_list: list) -> None:
datasets,
prompt_templates,
metrics,
attack_strategies,
grading_scale,
stats,
) = recipe.values()
@@ -352,9 +346,6 @@ def display_recipes(recipes_list: list) -> None:
"Prompt Templates", prompt_templates
)
metrics_info = display_view_list_format("Metrics", metrics)
attack_strategies_info = display_view_list_format(
"Attack Strategies", attack_strategies
)
grading_scale_info = display_view_grading_scale_format(
"Grading Scale", grading_scale
)
@@ -364,7 +355,9 @@
f"[red]id: {id}[/red]\n\n[blue]{name}[/blue]\n{description}\n\n"
f"{tags_info}\n\n{categories_info}\n\n{grading_scale_info}\n\n{stats_info}"
)
contains_info = f"{datasets_info}\n\n{prompt_templates_info}\n\n{metrics_info}\n\n{attack_strategies_info}"
contains_info = (
f"{datasets_info}\n\n{prompt_templates_info}\n\n{metrics_info}\n\n"
)

table.add_section()
table.add_row(str(recipe_id), recipe_info, contains_info)
@@ -499,7 +492,6 @@ def generate_recipe_table(recipes: list, endpoints: list, results: dict) -> None
"\"['bertscore','bleuscore']\" "
"-p \"['analogical-similarity','mmlu']\" "
"-t \"['tag1','tag2']\" "
"-a \"['charswap_attack']\" "
"-g \"{'A':[80,100],'B':[60,79],'C':[40,59],'D':[20,39],'E':[0,19]}\" ",
)
add_recipe_args.add_argument("name", type=str, help="Name of the new recipe")
@@ -527,13 +519,6 @@ def generate_recipe_table(recipes: list, endpoints: list, results: dict) -> None
add_recipe_args.add_argument(
"metrics", type=str, help="List of metrics to be included in the new recipe"
)
add_recipe_args.add_argument(
"-a",
"--attack_modules",
type=str,
help="List of attack modules to be included in the new recipe",
nargs="?",
)
add_recipe_args.add_argument(
"-g",
"--grading_scale",
@@ -553,7 +538,6 @@ def generate_recipe_table(recipes: list, endpoints: list, results: dict) -> None
" datasets: A list of datasets used in the recipe. \n"
" prompt_templates: A list of prompt templates for the recipe. \n"
" metrics: A list of metrics to evaluate the recipe. \n"
" attack_modules: A list of attack modules used in the recipe.\n"
" grading_scale: A list of grading scale used in the recipe. \n\n"
"Example command:\n"
" update_recipe my-new-recipe \"[('name', 'My Updated Recipe'), ('tags', ['fairness', 'bbq'])]\" ",
3 changes: 0 additions & 3 deletions moonshot/src/api/api_recipe.py
@@ -16,7 +16,6 @@ def api_create_recipe(
datasets: list[str],
prompt_templates: list[str],
metrics: list[str],
attack_modules: list[str],
grading_scale: dict[str, list[int]],
) -> str:
"""
@@ -34,7 +33,6 @@ def api_create_recipe(
datasets (list[str]): A list of datasets used in the recipe.
prompt_templates (list[str]): A list of prompt templates for the recipe.
metrics (list[str]): A list of metrics to evaluate the recipe.
attack_modules (list[str]): A list of attack modules used in the recipe.
grading_scale (dict[str, list[int]]): A grading scale dictionary where the key is the grade and the
value is a list of integers representing the scale.
@@ -50,7 +48,6 @@
datasets=datasets,
prompt_templates=prompt_templates,
metrics=metrics,
attack_modules=attack_modules,
grading_scale=grading_scale,
)
return Recipe.create(rec_args)
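
With `attack_modules` gone from the signature, a call to `api_create_recipe` now takes eight arguments; a minimal sketch with illustrative values only (the recipe name and contents are hypothetical):

```
new_recipe_id = api_create_recipe(
    "my-new-recipe",                    # name
    "A recipe without attack modules",  # description
    ["tag1", "tag2"],                   # tags
    ["category1"],                      # categories
    ["bbq-lite-age-ambiguous"],         # datasets
    ["analogical-similarity", "mmlu"],  # prompt_templates
    ["bertscore", "bleuscore"],         # metrics
    {"A": [80, 100], "B": [60, 79], "C": [40, 59], "D": [20, 39], "E": [0, 19]},  # grading_scale
)
```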
15 changes: 0 additions & 15 deletions moonshot/src/recipes/recipe.py
@@ -25,7 +25,6 @@ def __init__(self, rec_args: RecipeArguments) -> None:
self.datasets = rec_args.datasets
self.prompt_templates = rec_args.prompt_templates
self.metrics = rec_args.metrics
self.attack_modules = rec_args.attack_modules
self.grading_scale = rec_args.grading_scale
self.stats = rec_args.stats

@@ -76,7 +75,6 @@ def create(rec_args: RecipeArguments) -> str:
"datasets": rec_args.datasets,
"prompt_templates": rec_args.prompt_templates,
"metrics": rec_args.metrics,
"attack_modules": rec_args.attack_modules,
"grading_scale": rec_args.grading_scale,
}

@@ -96,12 +94,6 @@ def create(rec_args: RecipeArguments) -> str:
Recipe.check_file_exists(
EnvVariables.METRICS.name, rec_args.metrics, "Metric", "py"
)
Recipe.check_file_exists(
EnvVariables.ATTACK_MODULES.name,
rec_args.attack_modules,
"Attack Module",
"py",
)

# Write as json output
Storage.create_object(EnvVariables.RECIPES.name, rec_id, rec_info, "json")
@@ -191,7 +183,6 @@ def _read_recipe(rec_id: str, dataset_prompts_count: dict) -> dict:
"num_of_datasets": len(obj_results["datasets"]),
"num_of_prompt_templates": len(obj_results["prompt_templates"]),
"num_of_metrics": len(obj_results["metrics"]),
"num_of_attack_modules": len(obj_results["attack_modules"]),
"num_of_datasets_prompts": {},
}

@@ -247,12 +238,6 @@ def update(rec_args: RecipeArguments) -> bool:
Recipe.check_file_exists(
EnvVariables.METRICS.name, rec_args.metrics, "Metric", "py"
)
Recipe.check_file_exists(
EnvVariables.ATTACK_MODULES.name,
rec_args.attack_modules,
"Attack Module",
"py",
)

# Persist the updated recipe information to storage
Storage.create_object(
4 changes: 0 additions & 4 deletions moonshot/src/recipes/recipe_arguments.py
@@ -27,9 +27,6 @@ class RecipeArguments(BaseModel):
# metrics (list): The list of metrics in the recipe.
metrics: list[str] = Field(min_length=1)

# attack_modules (list): The list of attack modules in the recipe.
attack_modules: list[str]

# grading_scale (dict): A dictionary where keys are grading categories and values are lists of grading scale.
grading_scale: dict[str, list[int]]

@@ -105,7 +102,6 @@ def to_dict(self) -> dict:
"datasets": self.datasets,
"prompt_templates": self.prompt_templates,
"metrics": self.metrics,
"attack_modules": self.attack_modules,
"grading_scale": self.grading_scale,
"stats": self.stats,
}
15 changes: 15 additions & 0 deletions pyproject.toml
@@ -114,3 +114,18 @@ uvicorn = "0.30.0"
dependency-injector = "4.41.0"
cmd2 = "2.4.3"
rich = "13.7.1"

[tool.poetry.group.dev.dependencies]
black = "^24.4.2"
isort = "^5.13.2"
pre-commit = "^3.7.1"
pytest = "^8.3.1"
coverage = "^7.6.0"
pytest-mock = "^3.14.0"
flake8 = "^7.1.0"
poetry-plugin-export = "^1.8.0"
pyinstrument = "^4.6.2"
anybadge = "^1.14.0"
pytest-cov = "^5.0.0"
pytest-html = "^4.1.1"
pytest-json = "^0.4.0"
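
With this dev group in place, the tooling can be installed alongside the runtime dependencies; a typical invocation, assuming Poetry 1.2+ group support:

```
$ poetry install --with dev
```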
43 changes: 3 additions & 40 deletions tests/unit-tests/cli/test_benchmarking.py
@@ -227,6 +227,7 @@ def init(self):
f"{ut_data_dir}/databases/my-runner.db",
f"{ut_data_dir}/results/my-new-recipe-runner-result.json",
f"{ut_data_dir}/results/sample-result.json",
f"{ut_data_dir}/cookbooks/tamil-language-cookbook.json",
]

#files generated from unit tests
@@ -279,7 +280,6 @@ def init(self):
"\"['bbq-lite-age-ambiguous']\" "
"-p \"['analogical-similarity','mmlu']\" "
"-t \"['tag1','tag2']\" "
"-a \"['charswap_attack']\" "
"-g \"{'A':[80,100],'B':[60,79],'C':[40,59],'D':[20,39],'E':[0,19]}\" "],
err_missing_required_arg
),
@@ -291,7 +291,6 @@ def init(self):
"\"['category1','category2']\" "
"-p \"['analogical-similarity','mmlu']\" "
"-t \"['tag1','tag2']\" "
"-a \"['charswap_attack']\" "
"-g \"{'A':[80,100],'B':[60,79],'C':[40,59],'D':[20,39],'E':[0,19]}\" "],
err_missing_required_arg
),
@@ -305,7 +304,6 @@
"\"['bertscore','bleuscore']\" "
"-p \"['analogical-similarity','mmlu']\" "
"-t \"['tag1','tag2']\" "
"-a \"['charswap_attack']\" "
"-g \"{'A':[80,100],'B':[60,79],'C':[40,59],'D':[20,39],'E':[0,19]}\" "],
f"[add_recipe]: Recipe ({test_recipe_id}) created."
),
@@ -319,7 +317,6 @@
"\"['bertscore','bleuscore']\" "
"-p \"['analogical-similarity','mmlu']\" "
"-t \"['tag1','tag2']\" "
"-a \"['charswap_attack']\" "
"-g \"{'A':[80,100],'B':[60,79],'C':[40,59],'D':[20,39],'E':[0,19]}\" "],
"Dataset bbq-lite-age-ambiguousx does not exist."
),
@@ -334,7 +331,6 @@
"\"['bertscore','bleuscorex']\" "
"-p \"['analogical-similarity','mmlu']\" "
"-t \"['tag1','tag2']\" "
"-a \"['charswap_attack']\" "
"-g \"{'A':[80,100],'B':[60,79],'C':[40,59],'D':[20,39],'E':[0,19]}\" "],
"Metric bleuscorex does not exist."
),
@@ -349,25 +345,10 @@
"\"['bertscore','bleuscore']\" "
"-p \"['analogical-similarity','mmlux']\" "
"-t \"['tag1','tag2']\" "
"-a \"['charswap_attack']\" "
"-g \"{'A':[80,100],'B':[60,79],'C':[40,59],'D':[20,39],'E':[0,19]}\" "],
"Prompt Template mmlux does not exist."
),
# Failure: Add with non-existent attack module
(
["add_recipe 'My unit test recipe' "
"'hello world description?!' "
"\"['category1','category2']\" "
"\"['bbq-lite-age-ambiguous', 'bbq-lite-age-ambiguous']\" "
"\"['bertscore','bleuscore']\" "
"-p \"['analogical-similarity','mmlu']\" "
"-t \"['tag1','tag2']\" "
"-a \"['charswap_attackx']\" "
"-g \"{'A':[80,100],'B':[60,79],'C':[40,59],'D':[20,39],'E':[0,19]}\" "],
"Attack Module charswap_attackx does not exist."
),
# Failure: Add with incorrect parameter type for lists
(
["add_recipe 'My unit test recipe' "
@@ -376,11 +357,11 @@
"\"['bbq-lite-age-ambiguous']\" "
"\"['bertscore','bleuscore']\" "
"-p \"['analogical-similarity','mmlu']\" "
"-t \"['tag1','tag2']\" "
"-a \"'charswap_attack'\" "
"-t \"'tag1'\" "
"-g \"{'A':[80,100],'B':[60,79],'C':[40,59],'D':[20,39],'E':[0,19]}\" "],
"[add_recipe]: 1 validation error for api_create_recipe"
),
# Failure: Add with unknown flag
(
["add_recipe 'My unit test recipe' "
@@ -390,7 +371,6 @@
"\"['bertscore','bleuscore']\" "
"-p \"['analogical-similarity','mmlu']\" "
"-t \"['tag1','tag2']\" "
"-a \"['charswap_attack']\" "
"-g \"{'A':[80,100],'B':[60,79],'C':[40,59],'D':[20,39],'E':[0,19]}\" "
"-x o"],
err_unrecognised_arg
@@ -824,23 +804,6 @@ def test_list_metrics_output(self, function_args, expected_output, capsys):
"Prompt Template nope does not exist."
),
# Failure: Update with non-existent attack module
(["add_recipe 'My unit test recipe' "
"'hello world description?!' "
"\"['category1','category2']\" "
"\"['bbq-lite-age-ambiguous']\" "
"\"['bertscore','bleuscore']\" "
"-p \"['analogical-similarity','mmlu']\" "
"-t \"['tag1','tag2']\" ",
f"update_recipe {test_recipe_id} \"[('name', 'My Updated Recipe2'), ('tags', ['updated tag']), "
"('description', 'updated description'), ('categories', ['updated cat 1', 'updated cat 2']), "
" ('datasets', ['bbq-lite-age-ambiguous']), ('prompt_templates', ['analogical-similarity', 'mmlu']), "
" ('metrics', ['bleuscore']), ('attack_modules', ['nope']), "
" ('grading_scale', {'New A':[75,100],'New B':[50,74],'New C':[25,49],'New D':[0,24]}) "
"]\""],
"Attack Module nope does not exist."
),
# Failure: Update with unknown flag
([f"update_recipe {test_recipe_id} \"[('name', 'My Updated Recipe2'), ('tags', ['updated tag']), "
"('description', 'updated description'), ('categories', ['updated cat 1', 'updated cat 2']), "
20 changes: 0 additions & 20 deletions tests/unit-tests/src/data/attack-modules/cache.json

This file was deleted.
