Merge pull request #271 from aiverify-foundation/ms-292
[MS-292] Removal of generating dynamic benchmark prompts on the fly
imda-kelvinkok authored Jul 29, 2024
2 parents 08e6f0c + c79852f commit 67953fe
Showing 16 changed files with 77 additions and 324 deletions.
52 changes: 51 additions & 1 deletion docs/getting_started/quick_install.md
@@ -60,7 +60,57 @@ It is recommended to create a new Python virtual environment in your working dir
```
$ python -m venv venv
$ source venv/bin/activate
```

### Setting up Logging
By default, the Moonshot library uses a logger that writes log entries, tagged with their severity, to `moonshot.log`.

The logger currently uses the following severity levels:

| Logging Severity | Description |
| ---------------- | ------------------------------------------------------------ |
| debug | Used for detailed debugging information, helpful during development. |
| info | General information about system operation, useful for system monitoring. |
| warning | Indicates a potential issue that should be looked into but is not immediately critical. |
| error | Reports a failure within the system, requiring immediate attention. |
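
These names appear to map directly onto Python's standard `logging` levels (the format string shown later in this section is a stdlib `logging` format, which supports this reading); a minimal sketch of that assumed mapping, not Moonshot's own code:

```
import logging

# Assumed mapping from Moonshot's severity names to stdlib logging levels.
# This mirrors the table above; it is an illustration only.
SEVERITY_LEVELS = {
    "debug": logging.DEBUG,      # detailed diagnostic output
    "info": logging.INFO,        # routine operational messages
    "warning": logging.WARNING,  # potential issues worth investigating
    "error": logging.ERROR,      # failures needing immediate attention
}

print(SEVERITY_LEVELS["info"])  # 20
```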

Additionally, you can customize the logging behavior through environment variables:

| Environment Variable | Description | Default Value |
| -------------------- | ----------- | ------------- |
| MS_LOG_NAME | The name of the log file. | `moonshot.log` |
| MS_LOG_LEVEL | The minimum logging severity to capture. Can be one of `debug`, `info`, `warning`, or `error`. | `info` |
| MS_LOG_TO_FILE | Whether to write logs to a file (`true`) or to standard output (`false`). | `false` |

To customize Moonshot's logging behavior, export the environment variables in your terminal; this overrides the default logging configuration:
```
export MS_LOG_NAME=moonshot
export MS_LOG_LEVEL=debug
export MS_LOG_TO_FILE=true
```
After exporting these variables, any subsequent runs of the Moonshot application will adhere to these logging settings
until the terminal session ends or the variables are unset.
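
You can also scope the settings to a single run by prefixing the command itself; a sketch, assuming Moonshot's usual `python -m moonshot` entry point:

```
$ MS_LOG_LEVEL=debug MS_LOG_TO_FILE=true python -m moonshot cli interactive
```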

The logging format is designed to provide a clear and concise overview of each log entry, structured as follows:
```
%(asctime)s [%(levelname)s][%(filename)s::%(funcName)s(%(lineno)d)] %(message)s
```
This format includes:

- timestamp (`%(asctime)s`)
- severity level (`%(levelname)s`)
- filename (`%(filename)s`)
- function name (`%(funcName)s`)
- line number (`%(lineno)d`)
- log message (`%(message)s`)

For example:
```
2023-04-01 12:00:00 [INFO][module.py::main(10)] Application started successfully.
```
This format keeps logs easy to read while providing enough detail for debugging and monitoring.
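
As an illustration of the documented behavior (not Moonshot's actual implementation), a logger honoring these environment variables and this format could be assembled with Python's standard `logging` module roughly as follows:

```
import logging
import os
import sys

# Read the documented environment variables, falling back to the defaults above.
log_name = os.getenv("MS_LOG_NAME", "moonshot.log")
log_level = os.getenv("MS_LOG_LEVEL", "info").upper()
log_to_file = os.getenv("MS_LOG_TO_FILE", "false").lower() == "true"

# The format string documented above.
formatter = logging.Formatter(
    "%(asctime)s [%(levelname)s][%(filename)s::%(funcName)s(%(lineno)d)] %(message)s"
)

# MS_LOG_TO_FILE=true writes to the named file; otherwise log to standard output.
handler = logging.FileHandler(log_name) if log_to_file else logging.StreamHandler(sys.stdout)
handler.setFormatter(formatter)

logger = logging.getLogger("moonshot-demo")  # hypothetical logger name
logger.setLevel(getattr(logging, log_level, logging.INFO))
logger.addHandler(handler)

logger.info("Application started successfully.")
```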

### Specifying Custom Environment File
If you have a custom '<b>.env</b>' file, specify the path to the file as follows:
24 changes: 4 additions & 20 deletions moonshot/integrations/cli/benchmark/recipe.py
@@ -37,8 +37,7 @@ def add_recipe(args) -> None:
Args:
args (argparse.Namespace): The arguments provided to the command line interface.
Expected keys are name, description, tags, categories, dataset, prompt_templates, metrics, attack_modules,
and grading_scale.
Expected keys are name, description, tags, categories, dataset, prompt_templates, metrics and grading_scale.
Returns:
None
@@ -54,9 +53,6 @@ def add_recipe(args) -> None:
literal_eval(args.prompt_templates) if args.prompt_templates else []
)
metrics = literal_eval(args.metrics)
attack_modules = (
literal_eval(args.attack_modules) if args.attack_modules else []
)
grading_scale = literal_eval(args.grading_scale) if args.grading_scale else {}

new_recipe_id = api_create_recipe(
@@ -67,7 +63,6 @@ def add_recipe(args) -> None:
datasets,
prompt_templates,
metrics,
attack_modules,
grading_scale,
)
print(f"[add_recipe]: Recipe ({new_recipe_id}) created.")
@@ -340,7 +335,6 @@ def display_recipes(recipes_list: list) -> None:
datasets,
prompt_templates,
metrics,
attack_strategies,
grading_scale,
stats,
) = recipe.values()
@@ -352,9 +346,6 @@ def display_recipes(recipes_list: list) -> None:
"Prompt Templates", prompt_templates
)
metrics_info = display_view_list_format("Metrics", metrics)
attack_strategies_info = display_view_list_format(
"Attack Strategies", attack_strategies
)
grading_scale_info = display_view_grading_scale_format(
"Grading Scale", grading_scale
)
@@ -364,7 +355,9 @@
f"[red]id: {id}[/red]\n\n[blue]{name}[/blue]\n{description}\n\n"
f"{tags_info}\n\n{categories_info}\n\n{grading_scale_info}\n\n{stats_info}"
)
contains_info = f"{datasets_info}\n\n{prompt_templates_info}\n\n{metrics_info}\n\n{attack_strategies_info}"
contains_info = (
f"{datasets_info}\n\n{prompt_templates_info}\n\n{metrics_info}\n\n"
)

table.add_section()
table.add_row(str(recipe_id), recipe_info, contains_info)
@@ -499,7 +492,6 @@ def generate_recipe_table(recipes: list, endpoints: list, results: dict) -> None
"\"['bertscore','bleuscore']\" "
"-p \"['analogical-similarity','mmlu']\" "
"-t \"['tag1','tag2']\" "
"-a \"['charswap_attack']\" "
"-g \"{'A':[80,100],'B':[60,79],'C':[40,59],'D':[20,39],'E':[0,19]}\" ",
)
add_recipe_args.add_argument("name", type=str, help="Name of the new recipe")
@@ -527,13 +519,6 @@ def generate_recipe_table(recipes: list, endpoints: list, results: dict) -> None
add_recipe_args.add_argument(
"metrics", type=str, help="List of metrics to be included in the new recipe"
)
add_recipe_args.add_argument(
"-a",
"--attack_modules",
type=str,
help="List of attack modules to be included in the new recipe",
nargs="?",
)
add_recipe_args.add_argument(
"-g",
"--grading_scale",
@@ -553,7 +538,6 @@ def generate_recipe_table(recipes: list, endpoints: list, results: dict) -> None
" datasets: A list of datasets used in the recipe. \n"
" prompt_templates: A list of prompt templates for the recipe. \n"
" metrics: A list of metrics to evaluate the recipe. \n"
" attack_modules: A list of attack modules used in the recipe.\n"
" grading_scale: A list of grading scale used in the recipe. \n\n"
"Example command:\n"
" update_recipe my-new-recipe \"[('name', 'My Updated Recipe'), ('tags', ['fairness', 'bbq'])]\" ",
3 changes: 0 additions & 3 deletions moonshot/src/api/api_recipe.py
@@ -16,7 +16,6 @@ def api_create_recipe(
datasets: list[str],
prompt_templates: list[str],
metrics: list[str],
attack_modules: list[str],
grading_scale: dict[str, list[int]],
) -> str:
"""
@@ -34,7 +33,6 @@ def api_create_recipe(
datasets (list[str]): A list of datasets used in the recipe.
prompt_templates (list[str]): A list of prompt templates for the recipe.
metrics (list[str]): A list of metrics to evaluate the recipe.
attack_modules (list[str]): A list of attack modules used in the recipe.
grading_scale (dict[str, list[int]]): A grading scale dictionary where the key is the grade and the
value is a list of integers representing the scale.
@@ -50,7 +48,6 @@
datasets=datasets,
prompt_templates=prompt_templates,
metrics=metrics,
attack_modules=attack_modules,
grading_scale=grading_scale,
)
return Recipe.create(rec_args)
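
With `attack_modules` gone from the signature, a call to `api_create_recipe` now takes eight arguments; a minimal sketch with illustrative values only (the recipe name and contents are hypothetical):

```
new_recipe_id = api_create_recipe(
    "my-new-recipe",                    # name
    "A recipe without attack modules",  # description
    ["tag1", "tag2"],                   # tags
    ["category1"],                      # categories
    ["bbq-lite-age-ambiguous"],         # datasets
    ["analogical-similarity", "mmlu"],  # prompt_templates
    ["bertscore", "bleuscore"],         # metrics
    {"A": [80, 100], "B": [60, 79], "C": [40, 59], "D": [20, 39], "E": [0, 19]},  # grading_scale
)
```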
15 changes: 0 additions & 15 deletions moonshot/src/recipes/recipe.py
@@ -25,7 +25,6 @@ def __init__(self, rec_args: RecipeArguments) -> None:
self.datasets = rec_args.datasets
self.prompt_templates = rec_args.prompt_templates
self.metrics = rec_args.metrics
self.attack_modules = rec_args.attack_modules
self.grading_scale = rec_args.grading_scale
self.stats = rec_args.stats

@@ -76,7 +75,6 @@ def create(rec_args: RecipeArguments) -> str:
"datasets": rec_args.datasets,
"prompt_templates": rec_args.prompt_templates,
"metrics": rec_args.metrics,
"attack_modules": rec_args.attack_modules,
"grading_scale": rec_args.grading_scale,
}

@@ -96,12 +94,6 @@ def create(rec_args: RecipeArguments) -> str:
Recipe.check_file_exists(
EnvVariables.METRICS.name, rec_args.metrics, "Metric", "py"
)
Recipe.check_file_exists(
EnvVariables.ATTACK_MODULES.name,
rec_args.attack_modules,
"Attack Module",
"py",
)

# Write as json output
Storage.create_object(EnvVariables.RECIPES.name, rec_id, rec_info, "json")
@@ -191,7 +183,6 @@ def _read_recipe(rec_id: str, dataset_prompts_count: dict) -> dict:
"num_of_datasets": len(obj_results["datasets"]),
"num_of_prompt_templates": len(obj_results["prompt_templates"]),
"num_of_metrics": len(obj_results["metrics"]),
"num_of_attack_modules": len(obj_results["attack_modules"]),
"num_of_datasets_prompts": {},
}

@@ -247,12 +238,6 @@ def update(rec_args: RecipeArguments) -> bool:
Recipe.check_file_exists(
EnvVariables.METRICS.name, rec_args.metrics, "Metric", "py"
)
Recipe.check_file_exists(
EnvVariables.ATTACK_MODULES.name,
rec_args.attack_modules,
"Attack Module",
"py",
)

# Persist the updated recipe information to storage
Storage.create_object(
4 changes: 0 additions & 4 deletions moonshot/src/recipes/recipe_arguments.py
@@ -27,9 +27,6 @@ class RecipeArguments(BaseModel):
# metrics (list): The list of metrics in the recipe.
metrics: list[str] = Field(min_length=1)

# attack_modules (list): The list of attack modules in the recipe.
attack_modules: list[str]

# grading_scale (dict): A dictionary where keys are grading categories and values are lists of grading scale.
grading_scale: dict[str, list[int]]

@@ -105,7 +102,6 @@ def to_dict(self) -> dict:
"datasets": self.datasets,
"prompt_templates": self.prompt_templates,
"metrics": self.metrics,
"attack_modules": self.attack_modules,
"grading_scale": self.grading_scale,
"stats": self.stats,
}
15 changes: 15 additions & 0 deletions pyproject.toml
@@ -114,3 +114,18 @@ uvicorn = "0.30.0"
dependency-injector = "4.41.0"
cmd2 = "2.4.3"
rich = "13.7.1"

[tool.poetry.group.dev.dependencies]
black = "^24.4.2"
isort = "^5.13.2"
pre-commit = "^3.7.1"
pytest = "^8.3.1"
coverage = "^7.6.0"
pytest-mock = "^3.14.0"
flake8 = "^7.1.0"
poetry-plugin-export = "^1.8.0"
pyinstrument = "^4.6.2"
anybadge = "^1.14.0"
pytest-cov = "^5.0.0"
pytest-html = "^4.1.1"
pytest-json = "^0.4.0"
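
With this dev group in place, the tooling can be installed alongside the runtime dependencies; a typical invocation, assuming Poetry 1.2+ group support:

```
$ poetry install --with dev
```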
43 changes: 3 additions & 40 deletions tests/unit-tests/cli/test_benchmarking.py
@@ -227,6 +227,7 @@ def init(self):
f"{ut_data_dir}/databases/my-runner.db",
f"{ut_data_dir}/results/my-new-recipe-runner-result.json",
f"{ut_data_dir}/results/sample-result.json",
f"{ut_data_dir}/cookbooks/tamil-language-cookbook.json",
]

#files generated from unit tests
@@ -279,7 +280,6 @@ def init(self):
"\"['bbq-lite-age-ambiguous']\" "
"-p \"['analogical-similarity','mmlu']\" "
"-t \"['tag1','tag2']\" "
"-a \"['charswap_attack']\" "
"-g \"{'A':[80,100],'B':[60,79],'C':[40,59],'D':[20,39],'E':[0,19]}\" "],
err_missing_required_arg
),
@@ -291,7 +291,6 @@ def init(self):
"\"['category1','category2']\" "
"-p \"['analogical-similarity','mmlu']\" "
"-t \"['tag1','tag2']\" "
"-a \"['charswap_attack']\" "
"-g \"{'A':[80,100],'B':[60,79],'C':[40,59],'D':[20,39],'E':[0,19]}\" "],
err_missing_required_arg
),
@@ -305,7 +304,6 @@
"\"['bertscore','bleuscore']\" "
"-p \"['analogical-similarity','mmlu']\" "
"-t \"['tag1','tag2']\" "
"-a \"['charswap_attack']\" "
"-g \"{'A':[80,100],'B':[60,79],'C':[40,59],'D':[20,39],'E':[0,19]}\" "],
f"[add_recipe]: Recipe ({test_recipe_id}) created."
),
@@ -319,7 +317,6 @@
"\"['bertscore','bleuscore']\" "
"-p \"['analogical-similarity','mmlu']\" "
"-t \"['tag1','tag2']\" "
"-a \"['charswap_attack']\" "
"-g \"{'A':[80,100],'B':[60,79],'C':[40,59],'D':[20,39],'E':[0,19]}\" "],
"Dataset bbq-lite-age-ambiguousx does not exist."
),
@@ -334,7 +331,6 @@
"\"['bertscore','bleuscorex']\" "
"-p \"['analogical-similarity','mmlu']\" "
"-t \"['tag1','tag2']\" "
"-a \"['charswap_attack']\" "
"-g \"{'A':[80,100],'B':[60,79],'C':[40,59],'D':[20,39],'E':[0,19]}\" "],
"Metric bleuscorex does not exist."
),
@@ -349,25 +345,10 @@
"\"['bertscore','bleuscore']\" "
"-p \"['analogical-similarity','mmlux']\" "
"-t \"['tag1','tag2']\" "
"-a \"['charswap_attack']\" "
"-g \"{'A':[80,100],'B':[60,79],'C':[40,59],'D':[20,39],'E':[0,19]}\" "],
"Prompt Template mmlux does not exist."
),
# Failure: Add with non-existent attack module
(
["add_recipe 'My unit test recipe' "
"'hello world description?!' "
"\"['category1','category2']\" "
"\"['bbq-lite-age-ambiguous', 'bbq-lite-age-ambiguous']\" "
"\"['bertscore','bleuscore']\" "
"-p \"['analogical-similarity','mmlu']\" "
"-t \"['tag1','tag2']\" "
"-a \"['charswap_attackx']\" "
"-g \"{'A':[80,100],'B':[60,79],'C':[40,59],'D':[20,39],'E':[0,19]}\" "],
"Attack Module charswap_attackx does not exist."
),
# Failure: Add with incorrect parameter type for lists
(
["add_recipe 'My unit test recipe' "
@@ -376,11 +357,11 @@
"\"['bbq-lite-age-ambiguous']\" "
"\"['bertscore','bleuscore']\" "
"-p \"['analogical-similarity','mmlu']\" "
"-t \"['tag1','tag2']\" "
"-a \"'charswap_attack'\" "
"-t \"'tag1'\" "
"-g \"{'A':[80,100],'B':[60,79],'C':[40,59],'D':[20,39],'E':[0,19]}\" "],
"[add_recipe]: 1 validation error for api_create_recipe"
),
# Failure: Add with unknown flag
(
["add_recipe 'My unit test recipe' "
@@ -390,7 +371,6 @@
"\"['bertscore','bleuscore']\" "
"-p \"['analogical-similarity','mmlu']\" "
"-t \"['tag1','tag2']\" "
"-a \"['charswap_attack']\" "
"-g \"{'A':[80,100],'B':[60,79],'C':[40,59],'D':[20,39],'E':[0,19]}\" "
"-x o"],
err_unrecognised_arg
@@ -824,23 +804,6 @@ def test_list_metrics_output(self, function_args, expected_output, capsys):
"Prompt Template nope does not exist."
),
# Failure: Update with non-existent attack module
(["add_recipe 'My unit test recipe' "
"'hello world description?!' "
"\"['category1','category2']\" "
"\"['bbq-lite-age-ambiguous']\" "
"\"['bertscore','bleuscore']\" "
"-p \"['analogical-similarity','mmlu']\" "
"-t \"['tag1','tag2']\" ",
f"update_recipe {test_recipe_id} \"[('name', 'My Updated Recipe2'), ('tags', ['updated tag']), "
"('description', 'updated description'), ('categories', ['updated cat 1', 'updated cat 2']), "
" ('datasets', ['bbq-lite-age-ambiguous']), ('prompt_templates', ['analogical-similarity', 'mmlu']), "
" ('metrics', ['bleuscore']), ('attack_modules', ['nope']), "
" ('grading_scale', {'New A':[75,100],'New B':[50,74],'New C':[25,49],'New D':[0,24]}) "
"]\""],
"Attack Module nope does not exist."
),
# Failure: Update with unknown flag
([f"update_recipe {test_recipe_id} \"[('name', 'My Updated Recipe2'), ('tags', ['updated tag']), "
"('description', 'updated description'), ('categories', ['updated cat 1', 'updated cat 2']), "
20 changes: 0 additions & 20 deletions tests/unit-tests/src/data/attack-modules/cache.json

This file was deleted.
