diff --git a/moonshot/integrations/cli/benchmark/cookbook.py b/moonshot/integrations/cli/benchmark/cookbook.py index 0df1a679..38b2f134 100644 --- a/moonshot/integrations/cli/benchmark/cookbook.py +++ b/moonshot/integrations/cli/benchmark/cookbook.py @@ -19,8 +19,33 @@ api_update_cookbook, ) from moonshot.integrations.cli.benchmark.recipe import ( - display_view_grading_scale_format, - display_view_statistics_format, + _display_view_grading_scale_format, + _display_view_statistics_format, +) +from moonshot.integrations.cli.cli_errors import ( + ERROR_BENCHMARK_ADD_COOKBOOK_DESC_VALIDATION, + ERROR_BENCHMARK_ADD_COOKBOOK_NAME_VALIDATION, + ERROR_BENCHMARK_ADD_COOKBOOK_RECIPES_LIST_STR_VALIDATION, + ERROR_BENCHMARK_ADD_COOKBOOK_RECIPES_VALIDATION, + ERROR_BENCHMARK_DELETE_COOKBOOK_COOKBOOK_VALIDATION, + ERROR_BENCHMARK_LIST_COOKBOOK_FIND_VALIDATION, + ERROR_BENCHMARK_LIST_COOKBOOK_PAGINATION_VALIDATION, + ERROR_BENCHMARK_LIST_COOKBOOK_PAGINATION_VALIDATION_1, + ERROR_BENCHMARK_RUN_COOKBOOK_COOKBOOKS_VALIDATION, + ERROR_BENCHMARK_RUN_COOKBOOK_COOKBOOKS_VALIDATION_1, + ERROR_BENCHMARK_RUN_COOKBOOK_ENDPOINTS_VALIDATION, + ERROR_BENCHMARK_RUN_COOKBOOK_ENDPOINTS_VALIDATION_1, + ERROR_BENCHMARK_RUN_COOKBOOK_NAME_VALIDATION, + ERROR_BENCHMARK_RUN_COOKBOOK_NO_RESULT, + ERROR_BENCHMARK_RUN_COOKBOOK_NUM_OF_PROMPTS_VALIDATION, + ERROR_BENCHMARK_RUN_COOKBOOK_RANDOM_SEED_VALIDATION, + ERROR_BENCHMARK_RUN_COOKBOOK_RESULT_PROC_MOD_VALIDATION, + ERROR_BENCHMARK_RUN_COOKBOOK_RUNNER_PROC_MOD_VALIDATION, + ERROR_BENCHMARK_RUN_COOKBOOK_SYS_PROMPT_VALIDATION, + ERROR_BENCHMARK_UPDATE_COOKBOOK_COOKBOOK_VALIDATION, + ERROR_BENCHMARK_UPDATE_COOKBOOK_UPDATE_VALUES_VALIDATION, + ERROR_BENCHMARK_UPDATE_COOKBOOK_UPDATE_VALUES_VALIDATION_1, + ERROR_BENCHMARK_VIEW_COOKBOOK_COOKBOOK_VALIDATION, ) from moonshot.integrations.cli.common.display_helper import display_view_list_format from moonshot.integrations.cli.utils.process_data import filter_data @@ -45,11 +70,38 @@ def add_cookbook(args) -> None: description (str): The description of the cookbook. recipes (str): A string representation of a list of recipes. Each recipe is represented by its ID. + Raises: + TypeError: If the 'name', 'description', or 'recipes' arguments are not strings or are None. + ValueError: If the 'recipes' argument is not a list after evaluation. + Returns: None """ try: + if not isinstance(args.name, str) or not args.name or args.name is None: + raise TypeError(ERROR_BENCHMARK_ADD_COOKBOOK_NAME_VALIDATION) + + if ( + not isinstance(args.description, str) + or not args.description + or args.description is None + ): + raise TypeError(ERROR_BENCHMARK_ADD_COOKBOOK_DESC_VALIDATION) + + if ( + not isinstance(args.recipes, str) + or not args.recipes + or args.recipes is None + ): + raise TypeError(ERROR_BENCHMARK_ADD_COOKBOOK_RECIPES_VALIDATION) + recipes = literal_eval(args.recipes) + if not ( + isinstance(recipes, list) + and all(isinstance(recipe, str) for recipe in recipes) + ): + raise ValueError(ERROR_BENCHMARK_ADD_COOKBOOK_RECIPES_LIST_STR_VALIDATION) + new_cookbook_id = api_create_cookbook(args.name, args.description, recipes) print(f"[add_cookbook]: Cookbook ({new_cookbook_id}) created.") except Exception as e: @@ -61,22 +113,48 @@ def list_cookbooks(args) -> list | None: List all available cookbooks. This function retrieves all available cookbooks by calling the api_get_all_cookbook function from the - moonshot.api module. - It then displays the retrieved cookbooks using the _display_cookbooks function. + moonshot.api module. 
It then filters the retrieved cookbooks based on the provided 'find' keyword and + 'pagination' parameters, and displays the filtered cookbooks using the _display_cookbooks function. Args: - args: A namespace object from argparse. It should have an optional attribute: - find (str): Optional field to find cookbook(s) with a keyword. - pagination (str): Optional field to paginate cookbooks. + args: A namespace object from argparse. It should have the following optional attributes: + find (str): Optional field to find cookbook(s) with a keyword. + pagination (str): Optional field to paginate cookbooks. It should be a string representation of a tuple + containing two integers (page number and page size). + + Raises: + TypeError: If the 'find' or 'pagination' arguments are not strings or are None. + ValueError: If the 'pagination' argument is not a tuple of two integers after evaluation. Returns: - list | None: A list of Cookbook or None if there is no result. + list | None: A list of filtered cookbooks or None if there is no result. """ try: + if args.find is not None: + if not isinstance(args.find, str) or not args.find: + raise TypeError(ERROR_BENCHMARK_LIST_COOKBOOK_FIND_VALIDATION) + + if args.pagination is not None: + if not isinstance(args.pagination, str) or not args.pagination: + raise TypeError(ERROR_BENCHMARK_LIST_COOKBOOK_PAGINATION_VALIDATION) + try: + pagination = literal_eval(args.pagination) + if not ( + isinstance(pagination, tuple) + and len(pagination) == 2 + and all(isinstance(i, int) for i in pagination) + ): + raise ValueError( + ERROR_BENCHMARK_LIST_COOKBOOK_PAGINATION_VALIDATION_1 + ) + except (ValueError, SyntaxError): + raise ValueError(ERROR_BENCHMARK_LIST_COOKBOOK_PAGINATION_VALIDATION_1) + else: + pagination = () + cookbooks_list = api_get_all_cookbook() keyword = args.find.lower() if args.find else "" - pagination = literal_eval(args.pagination) if args.pagination else () if cookbooks_list: filtered_cookbooks_list = filter_data(cookbooks_list, keyword, pagination) @@ -89,6 +167,7 @@ def list_cookbooks(args) -> list | None: except Exception as e: print(f"[list_cookbooks]: {str(e)}") + return None def view_cookbook(args) -> None: @@ -96,19 +175,30 @@ def view_cookbook(args) -> None: View a specific cookbook. This function retrieves a specific cookbook by calling the api_read_cookbook function from the - moonshot.api module using the cookbook name provided in the args. + moonshot.api module using the cookbook ID provided in the args. It then displays the retrieved cookbook using the display_view_cookbook function. Args: args: A namespace object from argparse. It should have the following attribute: - cookbook (str): The id of the cookbook to view. + cookbook (str): The ID of the cookbook to view. + + Raises: + TypeError: If the 'cookbook' argument is not a string or is None. Returns: None """ try: + if ( + not isinstance(args.cookbook, str) + or not args.cookbook + or args.cookbook is None + ): + raise TypeError(ERROR_BENCHMARK_VIEW_COOKBOOK_COOKBOOK_VALIDATION) + cookbook_info = api_read_cookbook(args.cookbook) - display_view_cookbook(cookbook_info) + _display_view_cookbook(cookbook_info) + except Exception as e: print(f"[view_cookbook]: {str(e)}") @@ -132,48 +222,105 @@ def run_cookbook(args) -> None: runner_proc_module (str): The runner processing module to use. result_proc_module (str): The result processing module to use. + Raises: + TypeError: If any of the required arguments are not of the expected type or are None. 
+ ValueError: If the 'cookbooks' or 'endpoints' arguments are not lists of strings after evaluation. + RuntimeError: If no results are found after running the cookbooks. + Returns: None """ try: - name = args.name + if not isinstance(args.name, str) or not args.name or args.name is None: + raise TypeError(ERROR_BENCHMARK_RUN_COOKBOOK_NAME_VALIDATION) + + if ( + not isinstance(args.cookbooks, str) + or not args.cookbooks + or args.cookbooks is None + ): + raise TypeError(ERROR_BENCHMARK_RUN_COOKBOOK_COOKBOOKS_VALIDATION) + + if ( + not isinstance(args.endpoints, str) + or not args.endpoints + or args.endpoints is None + ): + raise TypeError(ERROR_BENCHMARK_RUN_COOKBOOK_ENDPOINTS_VALIDATION) + + if isinstance(args.num_of_prompts, bool) or not isinstance( + args.num_of_prompts, int + ): + raise TypeError(ERROR_BENCHMARK_RUN_COOKBOOK_NUM_OF_PROMPTS_VALIDATION) + + if isinstance(args.random_seed, bool) or not isinstance(args.random_seed, int): + raise TypeError(ERROR_BENCHMARK_RUN_COOKBOOK_RANDOM_SEED_VALIDATION) + + if ( + not isinstance(args.system_prompt, str) + or not args.system_prompt + or args.system_prompt is None + ): + raise TypeError(ERROR_BENCHMARK_RUN_COOKBOOK_SYS_PROMPT_VALIDATION) + + if ( + not isinstance(args.runner_proc_module, str) + or not args.runner_proc_module + or args.runner_proc_module is None + ): + raise TypeError(ERROR_BENCHMARK_RUN_COOKBOOK_RUNNER_PROC_MOD_VALIDATION) + + if ( + not isinstance(args.result_proc_module, str) + or not args.result_proc_module + or args.result_proc_module is None + ): + raise TypeError(ERROR_BENCHMARK_RUN_COOKBOOK_RESULT_PROC_MOD_VALIDATION) + cookbooks = literal_eval(args.cookbooks) + if not ( + isinstance(cookbooks, list) + and all(isinstance(item, str) for item in cookbooks) + ): + raise TypeError(ERROR_BENCHMARK_RUN_COOKBOOK_COOKBOOKS_VALIDATION_1) + endpoints = literal_eval(args.endpoints) - num_of_prompts = args.num_of_prompts - random_seed = args.random_seed - system_prompt = args.system_prompt - runner_proc_module = args.runner_proc_module - result_proc_module = args.result_proc_module + if not ( + isinstance(endpoints, list) + and all(isinstance(item, str) for item in endpoints) + ): + raise TypeError(ERROR_BENCHMARK_RUN_COOKBOOK_ENDPOINTS_VALIDATION_1) # Run the cookbooks with the defined endpoints - slugify_id = slugify(name, lowercase=True) + slugify_id = slugify(args.name, lowercase=True) if slugify_id in api_get_all_runner_name(): cb_runner = api_load_runner(slugify_id) else: - cb_runner = api_create_runner(name, endpoints) + cb_runner = api_create_runner(args.name, endpoints) - loop = asyncio.get_event_loop() - loop.run_until_complete( - cb_runner.run_cookbooks( + async def run(): + await cb_runner.run_cookbooks( cookbooks, - num_of_prompts, - random_seed, - system_prompt, - runner_proc_module, - result_proc_module, + args.num_of_prompts, + args.random_seed, + args.system_prompt, + args.runner_proc_module, + args.result_proc_module, ) - ) - cb_runner.close() + await cb_runner.close() + + loop = asyncio.get_event_loop() + loop.run_until_complete(run()) # Display results runner_runs = api_get_all_run(cb_runner.id) result_info = runner_runs[-1].get("results") if result_info: - show_cookbook_results( + _show_cookbook_results( cookbooks, endpoints, result_info, result_info["metadata"]["duration"] ) else: - raise RuntimeError("no run result generated") + raise RuntimeError(ERROR_BENCHMARK_RUN_COOKBOOK_NO_RESULT) except Exception as e: print(f"[run_cookbook]: {str(e)}") @@ -183,8 +330,8 @@ def update_cookbook(args) -> None: """ 
Update a specific cookbook. - This function updates a specific cookbook by calling the api_update_cookbook function from the - moonshot.api module using the cookbook name and update values provided in the args. + This function updates a specific cookbook by calling the api_update_cookbook function using the + cookbook name and update values provided in the args. Args: args: A namespace object from argparse. It should have the following attributes: @@ -192,13 +339,36 @@ def update_cookbook(args) -> None: update_values (str): A string representation of a list of tuples. Each tuple contains a key and a value to update in the cookbook. + Raises: + ValueError: If the 'cookbook' or 'update_values' arguments are not of the expected type or are None. + Returns: None """ try: + if ( + args.cookbook is None + or not isinstance(args.cookbook, str) + or not args.cookbook + ): + raise ValueError(ERROR_BENCHMARK_UPDATE_COOKBOOK_COOKBOOK_VALIDATION) + + if ( + args.update_values is None + or not isinstance(args.update_values, str) + or not args.update_values + ): + raise ValueError(ERROR_BENCHMARK_UPDATE_COOKBOOK_UPDATE_VALUES_VALIDATION) + cookbook = args.cookbook - update_values = dict(literal_eval(args.update_values)) + if literal_eval(args.update_values) and all( + isinstance(i, tuple) for i in literal_eval(args.update_values) + ): + update_values = dict(literal_eval(args.update_values)) + else: + raise ValueError(ERROR_BENCHMARK_UPDATE_COOKBOOK_UPDATE_VALUES_VALIDATION_1) api_update_cookbook(cookbook, **update_values) + print("[update_cookbook]: Cookbook updated.") except Exception as e: print(f"[update_cookbook]: {str(e)}") @@ -218,6 +388,9 @@ def delete_cookbook(args) -> None: args: A namespace object from argparse. It should have the following attribute: cookbook (str): The identifier of the cookbook to delete. + Raises: + ValueError: If the 'cookbook' argument is not a string or is None. + Returns: None """ @@ -228,7 +401,15 @@ def delete_cookbook(args) -> None: if confirmation.lower() != "y": console.print("[bold yellow]Cookbook deletion cancelled.[/]") return + try: + if ( + args.cookbook is None + or not isinstance(args.cookbook, str) + or not args.cookbook + ): + raise ValueError(ERROR_BENCHMARK_DELETE_COOKBOOK_COOKBOOK_VALIDATION) + api_delete_cookbook(args.cookbook) print("[delete_cookbook]: Cookbook deleted.") except Exception as e: @@ -248,6 +429,9 @@ def _display_cookbooks(cookbooks_list): Args: cookbooks_list (list): A list of dictionaries, where each dictionary contains the details of a cookbook. + + Returns: + None """ table = Table( title="List of Cookbooks", show_lines=True, expand=True, header_style="bold" @@ -265,7 +449,7 @@ def _display_cookbooks(cookbooks_list): console.print(table) -def display_view_cookbook(cookbook_info): +def _display_view_cookbook(cookbook_info): """ Display the cookbook information in a formatted table. 
@@ -313,10 +497,10 @@ def display_view_cookbook(cookbook_info): attack_strategies_info = display_view_list_format( "Attack Strategies", attack_strategies ) - grading_scale_info = display_view_grading_scale_format( + grading_scale_info = _display_view_grading_scale_format( "Grading Scale", grading_scale ) - stats_info = display_view_statistics_format("Statistics", stats) + stats_info = _display_view_statistics_format("Statistics", stats) recipe_info = ( f"[red]id: {id}[/red]\n\n[blue]{name}[/blue]\n{description}\n\n" @@ -331,11 +515,11 @@ def display_view_cookbook(cookbook_info): console.print("[red]There are no recipes found for the cookbook.[/red]") -def show_cookbook_results(cookbooks, endpoints, cookbook_results, duration): +def _show_cookbook_results(cookbooks, endpoints, cookbook_results, duration): """ Show the results of the cookbook benchmarking. - This function takes the cookbooks, endpoints, cookbook results, results file, and duration as arguments. + This function takes the cookbooks, endpoints, cookbook results, and duration as arguments. If there are results, it generates a table with the cookbook results and prints a message indicating where the results are saved. If there are no results, it prints a message indicating that no results were found. Finally, it prints the duration of the run. @@ -351,7 +535,7 @@ def show_cookbook_results(cookbooks, endpoints, cookbook_results, duration): """ if cookbook_results: # Display recipe results - generate_cookbook_table(cookbooks, endpoints, cookbook_results) + _generate_cookbook_table(cookbooks, endpoints, cookbook_results) else: console.print("[red]There are no results.[/red]") @@ -361,7 +545,7 @@ def show_cookbook_results(cookbooks, endpoints, cookbook_results, duration): console.print(run_stats) -def generate_cookbook_table(cookbooks: list, endpoints: list, results: dict) -> None: +def _generate_cookbook_table(cookbooks: list, endpoints: list, results: dict) -> None: """ Generate and display a table with the cookbook benchmarking results. diff --git a/moonshot/integrations/cli/benchmark/datasets.py b/moonshot/integrations/cli/benchmark/datasets.py index 4069797a..a712efb9 100644 --- a/moonshot/integrations/cli/benchmark/datasets.py +++ b/moonshot/integrations/cli/benchmark/datasets.py @@ -9,6 +9,13 @@ api_get_all_datasets, api_get_all_datasets_name, ) +from moonshot.integrations.cli.cli_errors import ( + ERROR_BENCHMARK_DELETE_DATASET_DATASET_VALIDATION, + ERROR_BENCHMARK_LIST_DATASETS_FIND_VALIDATION, + ERROR_BENCHMARK_LIST_DATASETS_PAGINATION_VALIDATION, + ERROR_BENCHMARK_LIST_DATASETS_PAGINATION_VALIDATION_1, + ERROR_BENCHMARK_VIEW_DATASET_DATASET_FILENAME_VALIDATION, +) from moonshot.integrations.cli.common.display_helper import display_view_str_format from moonshot.integrations.cli.utils.process_data import filter_data @@ -23,22 +30,43 @@ def list_datasets(args) -> list | None: List all available datasets. This function retrieves all available datasets by calling the api_get_all_datasets function from the - moonshot.api module. It then displays the datasets using the _display_datasets function. If an exception occurs, - it prints an error message. + moonshot.api module. It then filters the datasets based on the provided keyword and pagination arguments. + If there are no datasets, it prints a message indicating that no datasets were found. Args: - args: A namespace object from argparse. It should have an optional attribute: - find (str): Optional field to find dataset(s) with a keyword. 
- pagination (str): Optional field to paginate datasets. + args: A namespace object from argparse. It should have optional attributes: + find (str): Optional keyword to filter datasets. + pagination (str): Optional tuple to paginate datasets. Returns: - list | None: A list of Dataset or None if there is no result. + list | None: A list of datasets or None if there are no datasets. """ try: print("Listing datasets may take a while...") + if args.find is not None: + if not isinstance(args.find, str) or not args.find: + raise TypeError(ERROR_BENCHMARK_LIST_DATASETS_FIND_VALIDATION) + + if args.pagination is not None: + if not isinstance(args.pagination, str) or not args.pagination: + raise TypeError(ERROR_BENCHMARK_LIST_DATASETS_PAGINATION_VALIDATION) + try: + pagination = literal_eval(args.pagination) + if not ( + isinstance(pagination, tuple) + and len(pagination) == 2 + and all(isinstance(i, int) for i in pagination) + ): + raise ValueError( + ERROR_BENCHMARK_LIST_DATASETS_PAGINATION_VALIDATION_1 + ) + except (ValueError, SyntaxError): + raise ValueError(ERROR_BENCHMARK_LIST_DATASETS_PAGINATION_VALIDATION_1) + else: + pagination = () + datasets_list = api_get_all_datasets() keyword = args.find.lower() if args.find else "" - pagination = literal_eval(args.pagination) if args.pagination else () if datasets_list: filtered_datasets_list = filter_data(datasets_list, keyword, pagination) @@ -48,8 +76,10 @@ def list_datasets(args) -> list | None: console.print("[red]There are no datasets found.[/red]") return None + except Exception as e: print(f"[list_datasets]: {str(e)}") + return None def view_dataset(args) -> None: @@ -69,6 +99,13 @@ def view_dataset(args) -> None: """ try: print("Viewing datasets may take a while...") + if ( + not isinstance(args.dataset_filename, str) + or not args.dataset_filename + or args.dataset_filename is None + ): + raise TypeError(ERROR_BENCHMARK_VIEW_DATASET_DATASET_FILENAME_VALIDATION) + datasets_list = api_get_all_datasets() datasets_name_list = api_get_all_datasets_name() @@ -92,7 +129,7 @@ def delete_dataset(args) -> None: Args: args: A namespace object from argparse. It should have the following attribute: - dataset_name (str): The name of the dataset to delete. + dataset (str): The name of the dataset to delete. 
Returns: None @@ -104,7 +141,15 @@ def delete_dataset(args) -> None: if confirmation.lower() != "y": console.print("[bold yellow]Dataset deletion cancelled.[/]") return + try: + if ( + args.dataset is None + or not isinstance(args.dataset, str) + or not args.dataset + ): + raise ValueError(ERROR_BENCHMARK_DELETE_DATASET_DATASET_VALIDATION) + api_delete_dataset(args.dataset) print("[delete_dataset]: Dataset deleted.") except Exception as e: diff --git a/moonshot/integrations/cli/benchmark/metrics.py b/moonshot/integrations/cli/benchmark/metrics.py index 5e170ffd..92097bde 100644 --- a/moonshot/integrations/cli/benchmark/metrics.py +++ b/moonshot/integrations/cli/benchmark/metrics.py @@ -5,6 +5,13 @@ from rich.table import Table from moonshot.api import api_delete_metric, api_get_all_metric, api_get_all_metric_name +from moonshot.integrations.cli.cli_errors import ( + ERROR_BENCHMARK_DELETE_METRIC_METRIC_VALIDATION, + ERROR_BENCHMARK_LIST_METRICS_FIND_VALIDATION, + ERROR_BENCHMARK_LIST_METRICS_PAGINATION_VALIDATION, + ERROR_BENCHMARK_LIST_METRICS_PAGINATION_VALIDATION_1, + ERROR_BENCHMARK_VIEW_METRIC_METRIC_FILENAME_VALIDATION, +) from moonshot.integrations.cli.utils.process_data import filter_data console = Console() @@ -18,23 +25,44 @@ def list_metrics(args) -> list | None: List all available metrics. This function retrieves all available metrics by calling the api_get_all_metric function from the - moonshot.api module. It then displays the metrics using the _display_metrics function. If an exception occurs, - it prints an error message. + moonshot.api module. It then filters the metrics based on the provided keyword and pagination arguments. + If there are no metrics, it prints a message indicating that no metrics were found. Args: - args: A namespace object from argparse. It should have an optional attribute: - find (str): Optional field to find metric(s) with a keyword. - pagination (str): Optional field to paginate metrics. + args: A namespace object from argparse. It should have optional attributes: + find (str): Optional field to find metric(s) with a keyword. + pagination (str): Optional field to paginate metrics. Returns: - list | None: A list of Metric or None if there is no result. + list | None: A list of metrics or None if there are no metrics. 
""" try: print("Listing metrics may take a while...") + if args.find is not None: + if not isinstance(args.find, str) or not args.find: + raise TypeError(ERROR_BENCHMARK_LIST_METRICS_FIND_VALIDATION) + + if args.pagination is not None: + if not isinstance(args.pagination, str) or not args.pagination: + raise TypeError(ERROR_BENCHMARK_LIST_METRICS_PAGINATION_VALIDATION) + try: + pagination = literal_eval(args.pagination) + if not ( + isinstance(pagination, tuple) + and len(pagination) == 2 + and all(isinstance(i, int) for i in pagination) + ): + raise ValueError( + ERROR_BENCHMARK_LIST_METRICS_PAGINATION_VALIDATION_1 + ) + except (ValueError, SyntaxError): + raise ValueError(ERROR_BENCHMARK_LIST_METRICS_PAGINATION_VALIDATION_1) + else: + pagination = () + metrics_list = api_get_all_metric() keyword = args.find.lower() if args.find else "" - pagination = literal_eval(args.pagination) if args.pagination else () if metrics_list: filtered_metrics_list = filter_data(metrics_list, keyword, pagination) @@ -44,8 +72,10 @@ def list_metrics(args) -> list | None: console.print("[red]There are no metrics found.[/red]") return None + except Exception as e: print(f"[list_metrics]: {str(e)}") + return None def view_metric(args) -> None: @@ -65,6 +95,13 @@ def view_metric(args) -> None: """ try: print("Viewing metrics may take a while...") + if ( + not isinstance(args.metric_filename, str) + or not args.metric_filename + or args.metric_filename is None + ): + raise TypeError(ERROR_BENCHMARK_VIEW_METRIC_METRIC_FILENAME_VALIDATION) + metrics_list = api_get_all_metric() metrics_name_list = api_get_all_metric_name() @@ -100,7 +137,11 @@ def delete_metric(args) -> None: if confirmation.lower() != "y": console.print("[bold yellow]Metric deletion cancelled.[/]") return + try: + if args.metric is None or not isinstance(args.metric, str) or not args.metric: + raise ValueError(ERROR_BENCHMARK_DELETE_METRIC_METRIC_VALIDATION) + api_delete_metric(args.metric) print("[delete_metric]: Metric deleted.") except Exception as e: diff --git a/moonshot/integrations/cli/benchmark/recipe.py b/moonshot/integrations/cli/benchmark/recipe.py index 6cd8a562..fa5307c0 100644 --- a/moonshot/integrations/cli/benchmark/recipe.py +++ b/moonshot/integrations/cli/benchmark/recipe.py @@ -17,6 +17,41 @@ api_read_recipe, api_update_recipe, ) +from moonshot.integrations.cli.cli_errors import ( + ERROR_BENCHMARK_ADD_RECIPE_CATEGORIES_LIST_STR_VALIDATION, + ERROR_BENCHMARK_ADD_RECIPE_CATEGORIES_VALIDATION, + ERROR_BENCHMARK_ADD_RECIPE_DATASETS_LIST_STR_VALIDATION, + ERROR_BENCHMARK_ADD_RECIPE_DATASETS_VALIDATION, + ERROR_BENCHMARK_ADD_RECIPE_DESC_VALIDATION, + ERROR_BENCHMARK_ADD_RECIPE_GRADING_SCALE_DICT_STR_VALIDATION, + ERROR_BENCHMARK_ADD_RECIPE_GRADING_SCALE_VALIDATION, + ERROR_BENCHMARK_ADD_RECIPE_METRICS_LIST_STR_VALIDATION, + ERROR_BENCHMARK_ADD_RECIPE_METRICS_VALIDATION, + ERROR_BENCHMARK_ADD_RECIPE_NAME_VALIDATION, + ERROR_BENCHMARK_ADD_RECIPE_PROMPT_TEMPLATES_LIST_STR_VALIDATION, + ERROR_BENCHMARK_ADD_RECIPE_PROMPT_TEMPLATES_VALIDATION, + ERROR_BENCHMARK_ADD_RECIPE_TAGS_LIST_STR_VALIDATION, + ERROR_BENCHMARK_ADD_RECIPE_TAGS_VALIDATION, + ERROR_BENCHMARK_DELETE_RECIPE_RECIPE_VALIDATION, + ERROR_BENCHMARK_LIST_RECIPES_FIND_VALIDATION, + ERROR_BENCHMARK_LIST_RECIPES_PAGINATION_VALIDATION, + ERROR_BENCHMARK_LIST_RECIPES_PAGINATION_VALIDATION_1, + ERROR_BENCHMARK_RUN_RECIPE_ENDPOINTS_VALIDATION, + ERROR_BENCHMARK_RUN_RECIPE_ENDPOINTS_VALIDATION_1, + ERROR_BENCHMARK_RUN_RECIPE_NAME_VALIDATION, + ERROR_BENCHMARK_RUN_RECIPE_NO_RESULT, + 
ERROR_BENCHMARK_RUN_RECIPE_NUM_OF_PROMPTS_VALIDATION, + ERROR_BENCHMARK_RUN_RECIPE_RANDOM_SEED_VALIDATION, + ERROR_BENCHMARK_RUN_RECIPE_RECIPES_VALIDATION, + ERROR_BENCHMARK_RUN_RECIPE_RECIPES_VALIDATION_1, + ERROR_BENCHMARK_RUN_RECIPE_RESULT_PROC_MOD_VALIDATION, + ERROR_BENCHMARK_RUN_RECIPE_RUNNER_PROC_MOD_VALIDATION, + ERROR_BENCHMARK_RUN_RECIPE_SYS_PROMPT_VALIDATION, + ERROR_BENCHMARK_UPDATE_RECIPE_RECIPE_VALIDATION, + ERROR_BENCHMARK_UPDATE_RECIPE_UPDATE_VALUES_VALIDATION, + ERROR_BENCHMARK_UPDATE_RECIPE_UPDATE_VALUES_VALIDATION_1, + ERROR_BENCHMARK_VIEW_RECIPE_RECIPE_VALIDATION, +) from moonshot.integrations.cli.common.display_helper import display_view_list_format from moonshot.integrations.cli.utils.process_data import filter_data @@ -37,23 +72,114 @@ def add_recipe(args) -> None: Args: args (argparse.Namespace): The arguments provided to the command line interface. - Expected keys are name, description, tags, categories, dataset, prompt_templates, metrics and grading_scale. + Expected keys are name, description, tags, categories, datasets, prompt_templates, metrics, and grading_scale. Returns: None Raises: - Exception: If there is an error during the creation of the recipe or the arguments cannot be evaluated. + TypeError: If any of the required arguments are not strings or are None. + ValueError: If the evaluated arguments are not of the expected types. """ try: - tags = literal_eval(args.tags) if args.tags else [] + if not isinstance(args.name, str) or not args.name or args.name is None: + raise TypeError(ERROR_BENCHMARK_ADD_RECIPE_NAME_VALIDATION) + + if ( + not isinstance(args.description, str) + or not args.description + or args.description is None + ): + raise TypeError(ERROR_BENCHMARK_ADD_RECIPE_DESC_VALIDATION) + + if not isinstance(args.tags, str) or not args.tags or args.tags is None: + raise TypeError(ERROR_BENCHMARK_ADD_RECIPE_TAGS_VALIDATION) + + if ( + not isinstance(args.categories, str) + or not args.categories + or args.categories is None + ): + raise TypeError(ERROR_BENCHMARK_ADD_RECIPE_CATEGORIES_VALIDATION) + + if ( + not isinstance(args.datasets, str) + or not args.datasets + or args.datasets is None + ): + raise TypeError(ERROR_BENCHMARK_ADD_RECIPE_DATASETS_VALIDATION) + + if ( + not isinstance(args.prompt_templates, str) + or not args.prompt_templates + or args.prompt_templates is None + ): + raise TypeError(ERROR_BENCHMARK_ADD_RECIPE_PROMPT_TEMPLATES_VALIDATION) + + if ( + not isinstance(args.metrics, str) + or not args.metrics + or args.metrics is None + ): + raise TypeError(ERROR_BENCHMARK_ADD_RECIPE_METRICS_VALIDATION) + + if ( + not isinstance(args.grading_scale, str) + or not args.grading_scale + or args.grading_scale is None + ): + raise TypeError(ERROR_BENCHMARK_ADD_RECIPE_GRADING_SCALE_VALIDATION) + + tags = literal_eval(args.tags) categories = literal_eval(args.categories) datasets = literal_eval(args.datasets) - prompt_templates = ( - literal_eval(args.prompt_templates) if args.prompt_templates else [] - ) + prompt_templates = literal_eval(args.prompt_templates) metrics = literal_eval(args.metrics) - grading_scale = literal_eval(args.grading_scale) if args.grading_scale else {} + grading_scale = literal_eval(args.grading_scale) + + if not (isinstance(tags, list) and all(isinstance(tag, str) for tag in tags)): + raise ValueError(ERROR_BENCHMARK_ADD_RECIPE_TAGS_LIST_STR_VALIDATION) + + if not ( + isinstance(categories, list) + and all(isinstance(category, str) for category in categories) + ): + raise 
ValueError(ERROR_BENCHMARK_ADD_RECIPE_CATEGORIES_LIST_STR_VALIDATION) + + if not ( + isinstance(datasets, list) + and all(isinstance(dataset, str) for dataset in datasets) + ): + raise ValueError(ERROR_BENCHMARK_ADD_RECIPE_DATASETS_LIST_STR_VALIDATION) + + if not ( + isinstance(prompt_templates, list) + and all( + isinstance(prompt_template, str) for prompt_template in prompt_templates + ) + ): + raise ValueError( + ERROR_BENCHMARK_ADD_RECIPE_PROMPT_TEMPLATES_LIST_STR_VALIDATION + ) + + if not ( + isinstance(metrics, list) + and all(isinstance(metric, str) for metric in metrics) + ): + raise ValueError(ERROR_BENCHMARK_ADD_RECIPE_METRICS_LIST_STR_VALIDATION) + + if not ( + isinstance(grading_scale, dict) + and all( + isinstance(gs, list) + and len(gs) == 2 + and all(isinstance(value, int) for value in gs) + for gs in grading_scale.values() + ) + ): + raise ValueError( + ERROR_BENCHMARK_ADD_RECIPE_GRADING_SCALE_DICT_STR_VALIDATION + ) new_recipe_id = api_create_recipe( args.name, @@ -79,18 +205,42 @@ def list_recipes(args) -> list | None: It then displays the retrieved recipes using the _display_recipes function. Args: - args: A namespace object from argparse. It should have an optional attribute: - find (str): Optional field to find recipe(s) with a keyword. - pagination (str): Optional field to paginate recipes. + args: A namespace object from argparse. It should have optional attributes: + find (str): Optional field to find recipe(s) with a keyword. + pagination (str): Optional field to paginate recipes. Returns: - list | None: A list of Recipe or None if there is no result. - """ + list | None: A list of recipes or None if there is no result. + Raises: + TypeError: If the 'find' or 'pagination' arguments are not strings or are invalid. + ValueError: If the 'pagination' argument cannot be evaluated into a tuple of two integers. + """ try: + if args.find is not None: + if not isinstance(args.find, str) or not args.find: + raise TypeError(ERROR_BENCHMARK_LIST_RECIPES_FIND_VALIDATION) + + if args.pagination is not None: + if not isinstance(args.pagination, str) or not args.pagination: + raise TypeError(ERROR_BENCHMARK_LIST_RECIPES_PAGINATION_VALIDATION) + try: + pagination = literal_eval(args.pagination) + if not ( + isinstance(pagination, tuple) + and len(pagination) == 2 + and all(isinstance(i, int) for i in pagination) + ): + raise ValueError( + ERROR_BENCHMARK_LIST_RECIPES_PAGINATION_VALIDATION_1 + ) + except (ValueError, SyntaxError): + raise ValueError(ERROR_BENCHMARK_LIST_RECIPES_PAGINATION_VALIDATION_1) + else: + pagination = () + recipes_list = api_get_all_recipe() keyword = args.find.lower() if args.find else "" - pagination = literal_eval(args.pagination) if args.pagination else () if recipes_list: filtered_recipes_list = filter_data(recipes_list, keyword, pagination) @@ -103,6 +253,7 @@ def list_recipes(args) -> list | None: except Exception as e: print(f"[list_recipes]: {str(e)}") + return None def view_recipe(args) -> None: @@ -111,7 +262,7 @@ def view_recipe(args) -> None: This function retrieves a specific recipe by calling the api_read_recipe function from the moonshot.api module using the recipe name provided in the args. - It then displays the retrieved recipe using the display_view_recipe function. + It then displays the retrieved recipe using the _display_recipes function. Args: args: A namespace object from argparse. 
It should have the following attribute: @@ -119,8 +270,14 @@ def view_recipe(args) -> None: Returns: None + + Raises: + TypeError: If the 'recipe' argument is not a string or is None. """ try: + if not isinstance(args.recipe, str) or not args.recipe or args.recipe is None: + raise TypeError(ERROR_BENCHMARK_VIEW_RECIPE_RECIPE_VALIDATION) + recipe_info = api_read_recipe(args.recipe) _display_recipes([recipe_info]) except Exception as e: @@ -148,46 +305,103 @@ def run_recipe(args) -> None: Returns: None + + Raises: + TypeError: If any of the required arguments are not of the expected types or are None. + ValueError: If the 'recipes' or 'endpoints' arguments cannot be evaluated into lists of strings. + RuntimeError: If no results are found after running the recipes. """ try: - name = args.name + if not isinstance(args.name, str) or not args.name or args.name is None: + raise TypeError(ERROR_BENCHMARK_RUN_RECIPE_NAME_VALIDATION) + + if ( + not isinstance(args.recipes, str) + or not args.recipes + or args.recipes is None + ): + raise TypeError(ERROR_BENCHMARK_RUN_RECIPE_RECIPES_VALIDATION) + + if ( + not isinstance(args.endpoints, str) + or not args.endpoints + or args.endpoints is None + ): + raise TypeError(ERROR_BENCHMARK_RUN_RECIPE_ENDPOINTS_VALIDATION) + + if isinstance(args.num_of_prompts, bool) or not isinstance( + args.num_of_prompts, int + ): + raise TypeError(ERROR_BENCHMARK_RUN_RECIPE_NUM_OF_PROMPTS_VALIDATION) + + if isinstance(args.random_seed, bool) or not isinstance(args.random_seed, int): + raise TypeError(ERROR_BENCHMARK_RUN_RECIPE_RANDOM_SEED_VALIDATION) + + if ( + not isinstance(args.system_prompt, str) + or not args.system_prompt + or args.system_prompt is None + ): + raise TypeError(ERROR_BENCHMARK_RUN_RECIPE_SYS_PROMPT_VALIDATION) + + if ( + not isinstance(args.runner_proc_module, str) + or not args.runner_proc_module + or args.runner_proc_module is None + ): + raise TypeError(ERROR_BENCHMARK_RUN_RECIPE_RUNNER_PROC_MOD_VALIDATION) + + if ( + not isinstance(args.result_proc_module, str) + or not args.result_proc_module + or args.result_proc_module is None + ): + raise TypeError(ERROR_BENCHMARK_RUN_RECIPE_RESULT_PROC_MOD_VALIDATION) + recipes = literal_eval(args.recipes) + if not ( + isinstance(recipes, list) and all(isinstance(item, str) for item in recipes) + ): + raise TypeError(ERROR_BENCHMARK_RUN_RECIPE_RECIPES_VALIDATION_1) + endpoints = literal_eval(args.endpoints) - num_of_prompts = args.num_of_prompts - random_seed = args.random_seed - system_prompt = args.system_prompt - runner_proc_module = args.runner_proc_module - result_proc_module = args.result_proc_module + if not ( + isinstance(endpoints, list) + and all(isinstance(item, str) for item in endpoints) + ): + raise TypeError(ERROR_BENCHMARK_RUN_RECIPE_ENDPOINTS_VALIDATION_1) # Run the recipes with the defined endpoints - slugify_id = slugify(name, lowercase=True) + slugify_id = slugify(args.name, lowercase=True) if slugify_id in api_get_all_runner_name(): rec_runner = api_load_runner(slugify_id) else: - rec_runner = api_create_runner(name, endpoints) + rec_runner = api_create_runner(args.name, endpoints) - loop = asyncio.get_event_loop() - loop.run_until_complete( - rec_runner.run_recipes( + async def run(): + await rec_runner.run_recipes( recipes, - num_of_prompts, - random_seed, - system_prompt, - runner_proc_module, - result_proc_module, + args.num_of_prompts, + args.random_seed, + args.system_prompt, + args.runner_proc_module, + args.result_proc_module, ) - ) - rec_runner.close() + await rec_runner.close() + 
+ loop = asyncio.get_event_loop() + loop.run_until_complete(run()) # Display results runner_runs = api_get_all_run(rec_runner.id) result_info = runner_runs[-1].get("results") if result_info: - show_recipe_results( + _show_recipe_results( recipes, endpoints, result_info, result_info["metadata"]["duration"] ) else: - raise RuntimeError("no run result generated") + raise RuntimeError(ERROR_BENCHMARK_RUN_RECIPE_NO_RESULT) + except Exception as e: print(f"[run_recipe]: {str(e)}") @@ -207,11 +421,31 @@ def update_recipe(args) -> None: Returns: None + + Raises: + ValueError: If the 'recipe' or 'update_values' arguments are not strings or are None. + ValueError: If the 'update_values' argument cannot be evaluated into a list of tuples. """ try: + if args.recipe is None or not isinstance(args.recipe, str) or not args.recipe: + raise ValueError(ERROR_BENCHMARK_UPDATE_RECIPE_RECIPE_VALIDATION) + + if ( + args.update_values is None + or not isinstance(args.update_values, str) + or not args.update_values + ): + raise ValueError(ERROR_BENCHMARK_UPDATE_RECIPE_UPDATE_VALUES_VALIDATION) + recipe = args.recipe - update_values = dict(literal_eval(args.update_values)) + if literal_eval(args.update_values) and all( + isinstance(i, tuple) for i in literal_eval(args.update_values) + ): + update_values = dict(literal_eval(args.update_values)) + else: + raise ValueError(ERROR_BENCHMARK_UPDATE_RECIPE_UPDATE_VALUES_VALIDATION_1) api_update_recipe(recipe, **update_values) + print("[update_recipe]: Recipe updated.") except Exception as e: print(f"[update_recipe]: {str(e)}") @@ -232,6 +466,9 @@ def delete_recipe(args) -> None: Returns: None + + Raises: + ValueError: If the 'recipe' argument is not a string or is None. """ # Confirm with the user before deleting a recipe confirmation = console.input( @@ -240,7 +477,11 @@ def delete_recipe(args) -> None: if confirmation.lower() != "y": console.print("[bold yellow]Recipe deletion cancelled.[/]") return + try: + if args.recipe is None or not isinstance(args.recipe, str) or not args.recipe: + raise ValueError(ERROR_BENCHMARK_DELETE_RECIPE_RECIPE_VALIDATION) + api_delete_recipe(args.recipe) print("[delete_recipe]: Recipe deleted.") except Exception as e: @@ -250,7 +491,7 @@ def delete_recipe(args) -> None: # ------------------------------------------------------------------------------ # Helper functions: Display on cli # ------------------------------------------------------------------------------ -def display_view_grading_scale_format(title: str, grading_scale: dict) -> str: +def _display_view_grading_scale_format(title: str, grading_scale: dict) -> str: """ Format the grading scale for display. @@ -275,7 +516,7 @@ def display_view_grading_scale_format(title: str, grading_scale: dict) -> str: return f"[blue]{title}[/blue]: nil" -def display_view_statistics_format(title: str, stats: dict) -> str: +def _display_view_statistics_format(title: str, stats: dict) -> str: """ Format the statistics for display. 
@@ -348,10 +589,10 @@ def _display_recipes(recipes_list: list) -> None: "Prompt Templates", prompt_templates ) metrics_info = display_view_list_format("Metrics", metrics) - grading_scale_info = display_view_grading_scale_format( + grading_scale_info = _display_view_grading_scale_format( "Grading Scale", grading_scale ) - stats_info = display_view_statistics_format("Statistics", stats) + stats_info = _display_view_statistics_format("Statistics", stats) recipe_info = ( f"[red]id: {id}[/red]\n\n[blue]{name}[/blue]\n{description}\n\n" @@ -364,7 +605,7 @@ def _display_recipes(recipes_list: list) -> None: console.print(table) -def show_recipe_results(recipes, endpoints, recipe_results, duration): +def _show_recipe_results(recipes, endpoints, recipe_results, duration): """ Show the results of the recipe benchmarking. @@ -384,7 +625,7 @@ def show_recipe_results(recipes, endpoints, recipe_results, duration): """ if recipe_results: # Display recipe results - generate_recipe_table(recipes, endpoints, recipe_results) + _generate_recipe_table(recipes, endpoints, recipe_results) else: console.print("[red]There are no results.[/red]") @@ -394,7 +635,7 @@ def show_recipe_results(recipes, endpoints, recipe_results, duration): console.print(run_stats) -def generate_recipe_table(recipes: list, endpoints: list, results: dict) -> None: +def _generate_recipe_table(recipes: list, endpoints: list, results: dict) -> None: """ Generate and display a table of recipe results. diff --git a/moonshot/integrations/cli/benchmark/result.py b/moonshot/integrations/cli/benchmark/result.py index 15231453..95648d87 100644 --- a/moonshot/integrations/cli/benchmark/result.py +++ b/moonshot/integrations/cli/benchmark/result.py @@ -5,8 +5,17 @@ from rich.table import Table from moonshot.api import api_delete_result, api_get_all_result, api_read_result -from moonshot.integrations.cli.benchmark.cookbook import show_cookbook_results -from moonshot.integrations.cli.benchmark.recipe import show_recipe_results +from moonshot.integrations.cli.benchmark.cookbook import _show_cookbook_results +from moonshot.integrations.cli.benchmark.recipe import _show_recipe_results +from moonshot.integrations.cli.cli_errors import ( + ERROR_BENCHMARK_DELETE_RESULT_RESULT_VALIDATION, + ERROR_BENCHMARK_LIST_RESULTS_FIND_VALIDATION, + ERROR_BENCHMARK_LIST_RESULTS_PAGINATION_VALIDATION, + ERROR_BENCHMARK_LIST_RESULTS_PAGINATION_VALIDATION_1, + ERROR_BENCHMARK_VIEW_RESULT_METADATA_INVALID_VALIDATION, + ERROR_BENCHMARK_VIEW_RESULT_METADATA_VALIDATION, + ERROR_BENCHMARK_VIEW_RESULT_RESULT_FILENAME_VALIDATION, +) from moonshot.integrations.cli.common.display_helper import ( display_view_list_format, display_view_str_format, @@ -23,23 +32,44 @@ def list_results(args) -> list | None: """ List all available results. - This function retrieves all available results by calling the api_get_all_result_name function from the - moonshot.api module. It then creates a table with the result id and name. If there are no results, it prints a - message indicating that no results were found. + This function retrieves all available results by calling the api_get_all_result function from the + moonshot.api module. It then filters the results based on the provided keyword and pagination arguments. + If there are no results, it prints a message indicating that no results were found. Args: - args: A namespace object from argparse. It should have an optional attribute: - find (str): Optional field to find result(s) with a keyword. 
- pagination (str): Optional field to paginate results. + args (argparse.Namespace): The arguments provided to the command line interface. + find (str): Optional field to find result(s) with a keyword. + pagination (str): Optional field to paginate results. Returns: - list | None: A list of Result or None if there is no result. + list | None: A list of results or None if there are no results. """ try: + if args.find is not None: + if not isinstance(args.find, str) or not args.find: + raise TypeError(ERROR_BENCHMARK_LIST_RESULTS_FIND_VALIDATION) + + if args.pagination is not None: + if not isinstance(args.pagination, str) or not args.pagination: + raise TypeError(ERROR_BENCHMARK_LIST_RESULTS_PAGINATION_VALIDATION) + try: + pagination = literal_eval(args.pagination) + if not ( + isinstance(pagination, tuple) + and len(pagination) == 2 + and all(isinstance(i, int) for i in pagination) + ): + raise ValueError( + ERROR_BENCHMARK_LIST_RESULTS_PAGINATION_VALIDATION_1 + ) + except (ValueError, SyntaxError): + raise ValueError(ERROR_BENCHMARK_LIST_RESULTS_PAGINATION_VALIDATION_1) + else: + pagination = () + results_list = api_get_all_result() keyword = args.find.lower() if args.find else "" - pagination = literal_eval(args.pagination) if args.pagination else () if results_list: filtered_results_list = filter_data(results_list, keyword, pagination) @@ -52,6 +82,7 @@ def list_results(args) -> list | None: except Exception as e: print(f"[list_results]: {str(e)}") + return None def view_result(args) -> None: @@ -60,24 +91,34 @@ def view_result(args) -> None: This function retrieves a specific result by calling the api_read_result function from the moonshot.api module using the result filename provided in the args. - It then checks if the result filename starts with "cookbook". If it does, it displays the result using the - display_view_cookbook_result function. Otherwise, it uses the display_view_recipe_result function. + It then checks the metadata of the result to determine whether to display it as a cookbook or recipe result. Args: - args: A namespace object from argparse. It should have the following attribute: + args (argparse.Namespace): The arguments provided to the command line interface. result_filename (str): The filename of the result to view. Returns: None """ try: + if ( + not isinstance(args.result_filename, str) + or not args.result_filename + or args.result_filename is None + ): + raise TypeError(ERROR_BENCHMARK_VIEW_RESULT_RESULT_FILENAME_VALIDATION) + result_info = api_read_result(args.result_filename) - if result_info["metadata"].get("cookbooks"): - display_view_cookbook_result(result_info) - elif result_info["metadata"].get("recipes"): - display_view_recipe_result(result_info) + if isinstance(result_info, dict) and "metadata" in result_info: + if result_info["metadata"].get("cookbooks"): + _display_view_cookbook_result(result_info) + elif result_info["metadata"].get("recipes"): + _display_view_recipe_result(result_info) + else: + raise TypeError(ERROR_BENCHMARK_VIEW_RESULT_METADATA_INVALID_VALIDATION) else: - print("[view_result]: Unable to determine cookbook or recipe") + raise TypeError(ERROR_BENCHMARK_VIEW_RESULT_METADATA_VALIDATION) + except Exception as e: print(f"[view_result]: {str(e)}") @@ -92,7 +133,7 @@ def delete_result(args) -> None: prints an error message. Args: - args: A namespace object from argparse. It should have the following attribute: + args (argparse.Namespace): The arguments provided to the command line interface. 
result (str): The identifier of the result to delete. Returns: @@ -105,7 +146,11 @@ def delete_result(args) -> None: if confirmation.lower() != "y": console.print("[bold yellow]Result deletion cancelled.[/]") return + try: + if args.result is None or not isinstance(args.result, str) or not args.result: + raise ValueError(ERROR_BENCHMARK_DELETE_RESULT_RESULT_VALIDATION) + api_delete_result(args.result) print("[delete_result]: Result deleted.") except Exception as e: @@ -123,7 +168,7 @@ def _display_results(results_list): message indicating that no results were found. Args: - results_list (list): A list of results. Each result is a dictionary with keys 'id' and 'name'. + results_list (list): A list of results. Each result is a dictionary with keys 'id' and 'metadata'. Returns: None @@ -170,13 +215,12 @@ def _display_results(results_list): console.print(table) -def display_view_recipe_result(result_info): +def _display_view_recipe_result(result_info): """ Display the recipe result. - This function takes the result file and result info as arguments. It converts the result info into a dictionary - using the convert_string_tuples_in_dict function. It then retrieves the recipes, endpoints, and duration from the - converted result info. Finally, it calls the show_recipe_results function from the + This function takes the result info as an argument. It retrieves the recipes, endpoints, and duration from the + result info. Finally, it calls the show_recipe_results function from the moonshot.integrations.cli.benchmark.recipe module to display the recipe results. Args: @@ -188,16 +232,15 @@ def display_view_recipe_result(result_info): recipes = result_info["metadata"]["recipes"] endpoints = result_info["metadata"]["endpoints"] duration = result_info["metadata"]["duration"] - show_recipe_results(recipes, endpoints, result_info, duration) + _show_recipe_results(recipes, endpoints, result_info, duration) -def display_view_cookbook_result(result_info): +def _display_view_cookbook_result(result_info): """ Display the cookbook result. - This function takes the result file and result info as arguments. It converts the result info into a dictionary - using the convert_string_tuples_in_dict function. It then retrieves the cookbooks, endpoints, and duration from the - converted result info. Finally, it calls the show_cookbook_results function from the + This function takes the result info as an argument. It retrieves the cookbooks, endpoints, and duration from the + result info. Finally, it calls the show_cookbook_results function from the moonshot.integrations.cli.benchmark.cookbook module to display the cookbook results. 
Args: @@ -209,7 +252,7 @@ def display_view_cookbook_result(result_info): cookbooks = result_info["metadata"]["cookbooks"] endpoints = result_info["metadata"]["endpoints"] duration = result_info["metadata"]["duration"] - show_cookbook_results(cookbooks, endpoints, result_info, duration) + _show_cookbook_results(cookbooks, endpoints, result_info, duration) # ------------------------------------------------------------------------------ diff --git a/moonshot/integrations/cli/benchmark/run.py b/moonshot/integrations/cli/benchmark/run.py index f96f3d35..0e1fba0e 100644 --- a/moonshot/integrations/cli/benchmark/run.py +++ b/moonshot/integrations/cli/benchmark/run.py @@ -5,6 +5,12 @@ from rich.table import Table from moonshot.api import api_get_all_run +from moonshot.integrations.cli.cli_errors import ( + ERROR_BENCHMARK_LIST_RUNS_FIND_VALIDATION, + ERROR_BENCHMARK_LIST_RUNS_PAGINATION_VALIDATION, + ERROR_BENCHMARK_LIST_RUNS_PAGINATION_VALIDATION_1, + ERROR_BENCHMARK_VIEW_RUN_RUNNER_ID_VALIDATION, +) from moonshot.integrations.cli.common.display_helper import ( display_view_list_format, display_view_str_format, @@ -22,23 +28,41 @@ def list_runs(args) -> list | None: List all runs. This function retrieves all available runs by calling the api_get_all_run function from the - moonshot.api module. It then calls the _display_runs function to present the retrieved run information - in a user-friendly format on the command line interface. If an exception occurs during the retrieval - or display process, it prints an error message. + moonshot.api module. It then filters the runs based on the provided keyword and pagination arguments. + If there are no runs, it prints a message indicating that no runs were found. Args: - args: A namespace object from argparse. It should have an optional attribute: - find (str): Optional field to find run(s) with a keyword. - pagination (str): Optional field to paginate runs. + args (argparse.Namespace): The arguments provided to the command line interface. + find (str): Optional field to find run(s) with a keyword. + pagination (str): Optional field to paginate runs. Returns: - list | None: A list of Run or None if there is no result. + list | None: A list of runs or None if there are no runs. """ try: + if args.find is not None: + if not isinstance(args.find, str) or not args.find: + raise TypeError(ERROR_BENCHMARK_LIST_RUNS_FIND_VALIDATION) + + if args.pagination is not None: + if not isinstance(args.pagination, str) or not args.pagination: + raise TypeError(ERROR_BENCHMARK_LIST_RUNS_PAGINATION_VALIDATION) + try: + pagination = literal_eval(args.pagination) + if not ( + isinstance(pagination, tuple) + and len(pagination) == 2 + and all(isinstance(i, int) for i in pagination) + ): + raise ValueError(ERROR_BENCHMARK_LIST_RUNS_PAGINATION_VALIDATION_1) + except (ValueError, SyntaxError): + raise ValueError(ERROR_BENCHMARK_LIST_RUNS_PAGINATION_VALIDATION_1) + else: + pagination = () + runner_run_info = api_get_all_run() keyword = args.find.lower() if args.find else "" - pagination = literal_eval(args.pagination) if args.pagination else () if runner_run_info: filtered_runs_list = filter_data(runner_run_info, keyword, pagination) @@ -51,6 +75,7 @@ def list_runs(args) -> list | None: except Exception as e: print(f"[list_runs]: {str(e)}") + return None def view_run(args) -> None: @@ -62,13 +87,20 @@ def view_run(args) -> None: user-friendly format. Args: - args: A namespace object from argparse. 
It should have the following attribute: - runner (str): The identifier of the runner whose runs are to be viewed. + args (argparse.Namespace): The arguments provided to the command line interface. + runner_id (str): The identifier of the runner whose runs are to be viewed. Returns: None """ try: + if ( + not isinstance(args.runner_id, str) + or not args.runner_id + or args.runner_id is None + ): + raise TypeError(ERROR_BENCHMARK_VIEW_RUN_RUNNER_ID_VALIDATION) + runner_run_info = api_get_all_run(args.runner_id) _display_runs(runner_run_info) except Exception as e: @@ -151,7 +183,7 @@ def _display_runs(runs_list: list): # ------------------------------------------------------------------------------ # View run arguments view_run_args = cmd2.Cmd2ArgumentParser( - description="View a runner runs.", + description="View a runner's runs.", epilog="Example:\n view_run my-new-cookbook-runner", ) view_run_args.add_argument("runner_id", type=str, help="Name of the runner") diff --git a/moonshot/integrations/cli/benchmark/runner.py b/moonshot/integrations/cli/benchmark/runner.py index 82704a0b..ef84118e 100644 --- a/moonshot/integrations/cli/benchmark/runner.py +++ b/moonshot/integrations/cli/benchmark/runner.py @@ -10,6 +10,10 @@ api_load_session, api_read_runner, ) +from moonshot.integrations.cli.cli_errors import ( + ERROR_BENCHMARK_DELETE_RUNNER_RUNNER_VALIDATION, + ERROR_BENCHMARK_VIEW_RUNNER_RUNNER_VALIDATION, +) from moonshot.integrations.cli.common.display_helper import ( display_view_list_format, display_view_str_format, @@ -25,9 +29,9 @@ def list_runners() -> None: """ List all runners. - This function retrieves and displays information about all runners, including their associated runs and session - information. It fetches the data using the api_get_all_runner, api_get_all_run, and api_get_available_session_info - functions, then calls the display_runners function to present it in a user-friendly format. + Retrieves and displays information about all runners, including their associated runs and session + information. Fetches the data using the api_get_all_runner, api_get_all_run, and api_get_available_session_info + functions, then calls the _display_runners function to present it in a user-friendly format. Returns: None @@ -36,7 +40,7 @@ def list_runners() -> None: runner_info = api_get_all_runner() runner_run_info = api_get_all_run() _, runner_session_info = api_get_available_session_info() - display_runners(runner_info, runner_run_info, runner_session_info) + _display_runners(runner_info, runner_run_info, runner_session_info) except Exception as e: print(f"[list_runners]: {str(e)}") @@ -45,22 +49,25 @@ def view_runner(args) -> None: """ View a specific runner. - This function retrieves and displays information about a specific runner, including its associated runs and session - information. It uses the runner identifier provided in the arguments to fetch the data and then calls the - display_runners function to present it in a user-friendly format. + Retrieves and displays information about a specific runner, including its associated runs and session + information. Uses the runner identifier provided in the arguments to fetch the data and then calls the + _display_runners function to present it in a user-friendly format. Args: - args: A namespace object from argparse. It should have the following attribute: + args (argparse.Namespace): A namespace object from argparse. It should have the following attribute: runner (str): The identifier of the runner to view. 
Returns: None """ try: + if not isinstance(args.runner, str) or not args.runner or args.runner is None: + raise TypeError(ERROR_BENCHMARK_VIEW_RUNNER_RUNNER_VALIDATION) + runner_info = api_read_runner(args.runner) runner_run_info = api_get_all_run(args.runner) runner_session_info = api_load_session(args.runner) - display_runners([runner_info], runner_run_info, [runner_session_info]) + _display_runners([runner_info], runner_run_info, [runner_session_info]) except Exception as e: print(f"[view_runner]: {str(e)}") @@ -69,13 +76,13 @@ def delete_runner(args) -> None: """ Delete a runner. - This function deletes a runner with the specified identifier. It prompts the user for confirmation before proceeding + Deletes a runner with the specified identifier. Prompts the user for confirmation before proceeding with the deletion. If the user confirms, it calls the api_delete_runner function from the moonshot.api module to delete the runner. If the deletion is successful, it prints a confirmation message. If an exception occurs, it prints an error message. Args: - args: A namespace object from argparse. It should have the following attribute: + args (argparse.Namespace): A namespace object from argparse. It should have the following attribute: runner (str): The identifier of the runner to delete. Returns: @@ -88,7 +95,11 @@ def delete_runner(args) -> None: if confirmation.lower() != "y": console.print("[bold yellow]Runner deletion cancelled.[/]") return + try: + if args.runner is None or not isinstance(args.runner, str) or not args.runner: + raise ValueError(ERROR_BENCHMARK_DELETE_RUNNER_RUNNER_VALIDATION) + api_delete_runner(args.runner) print("[delete_runner]: Runner deleted.") except Exception as e: @@ -98,24 +109,22 @@ def delete_runner(args) -> None: # ------------------------------------------------------------------------------ # Helper functions: Display on cli # ------------------------------------------------------------------------------ -def display_runners( +def _display_runners( runner_list: list, runner_run_info_list: list, runner_session_info_list: list ) -> None: """ Display runners in a table format. - This function takes lists of runner information, run information, and session information, then displays them in a + Takes lists of runner information, run information, and session information, then displays them in a table format on the command line interface. Each runner is listed with details such as the runner's ID, name, description, number of runs, number of sessions, database file, and endpoints. Args: - runner_list: A list of dictionaries, where each dictionary contains information about a runner. - - runner_run_info_list: A list of dictionaries, where each dictionary contains information about a run - associated with a runner. - - runner_session_info_list: A list of dictionaries, where each dictionary contains information about a session - associated with a runner. + runner_list (list): A list of dictionaries, where each dictionary contains information about a runner. + runner_run_info_list (list): A list of dictionaries, where each dictionary contains information about a run + associated with a runner. + runner_session_info_list (list): A list of dictionaries, where each dictionary contains information about a + session associated with a runner. 
Returns: None diff --git a/moonshot/integrations/cli/cli_errors.py b/moonshot/integrations/cli/cli_errors.py new file mode 100644 index 00000000..04dcfa2a --- /dev/null +++ b/moonshot/integrations/cli/cli_errors.py @@ -0,0 +1,330 @@ +# ------------------------------------------------------------------------------ +# Benchmark - add_cookbook +# ------------------------------------------------------------------------------ +ERROR_BENCHMARK_ADD_COOKBOOK_NAME_VALIDATION = ( + "The 'name' argument must be a non-empty string and not None." +) +ERROR_BENCHMARK_ADD_COOKBOOK_DESC_VALIDATION = ( + "The 'description' argument must be a non-empty string and not None." +) +ERROR_BENCHMARK_ADD_COOKBOOK_RECIPES_VALIDATION = ( + "The 'recipes' argument must be a non-empty string and not None." +) +ERROR_BENCHMARK_ADD_COOKBOOK_RECIPES_LIST_STR_VALIDATION = ( + "The 'recipes' argument must be a list of strings after evaluation." +) + +# ------------------------------------------------------------------------------ +# Benchmark - list_cookbook +# ------------------------------------------------------------------------------ +ERROR_BENCHMARK_LIST_COOKBOOK_FIND_VALIDATION = ( + "The 'find' argument must be a non-empty string and not None." +) +ERROR_BENCHMARK_LIST_COOKBOOK_PAGINATION_VALIDATION = ( + "The 'pagination' argument must be a non-empty string and not None." +) +ERROR_BENCHMARK_LIST_COOKBOOK_PAGINATION_VALIDATION_1 = ( + "The 'pagination' argument must be a tuple of two integers." +) + +# ------------------------------------------------------------------------------ +# Benchmark - view_cookbook +# ------------------------------------------------------------------------------ +ERROR_BENCHMARK_VIEW_COOKBOOK_COOKBOOK_VALIDATION = ( + "The 'cookbook' argument must be a non-empty string and not None." +) + +# ------------------------------------------------------------------------------ +# Benchmark - run_cookbook +# ------------------------------------------------------------------------------ +ERROR_BENCHMARK_RUN_COOKBOOK_NAME_VALIDATION = ( + "The 'name' argument must be a non-empty string and not None." +) +ERROR_BENCHMARK_RUN_COOKBOOK_COOKBOOKS_VALIDATION = ( + "The 'cookbooks' argument must be a non-empty string and not None." +) +ERROR_BENCHMARK_RUN_COOKBOOK_COOKBOOKS_VALIDATION_1 = ( + "The 'cookbooks' argument must evaluate to a list of strings." +) +ERROR_BENCHMARK_RUN_COOKBOOK_ENDPOINTS_VALIDATION = ( + "The 'endpoints' argument must be a non-empty string and not None." +) +ERROR_BENCHMARK_RUN_COOKBOOK_ENDPOINTS_VALIDATION_1 = ( + "The 'endpoints' argument must evaluate to a list of strings." +) +ERROR_BENCHMARK_RUN_COOKBOOK_NUM_OF_PROMPTS_VALIDATION = ( + "The 'num_of_prompts' argument must be an integer." +) +ERROR_BENCHMARK_RUN_COOKBOOK_RANDOM_SEED_VALIDATION = ( + "The 'random_seed' argument must be an integer." +) +ERROR_BENCHMARK_RUN_COOKBOOK_SYS_PROMPT_VALIDATION = ( + "The 'system_prompt' argument must be a non-empty string and not None." +) +ERROR_BENCHMARK_RUN_COOKBOOK_RUNNER_PROC_MOD_VALIDATION = ( + "The 'runner_proc_module' argument must be a non-empty string and not None." +) +ERROR_BENCHMARK_RUN_COOKBOOK_RESULT_PROC_MOD_VALIDATION = ( + "The 'result_proc_module' argument must be a non-empty string and not None." +) +ERROR_BENCHMARK_RUN_COOKBOOK_NO_RESULT = "There are no results generated." 
+ +# ------------------------------------------------------------------------------ +# Benchmark - update_cookbook +# ------------------------------------------------------------------------------ +ERROR_BENCHMARK_UPDATE_COOKBOOK_COOKBOOK_VALIDATION = ( + "The 'cookbook' argument must be a non-empty string and not None." +) +ERROR_BENCHMARK_UPDATE_COOKBOOK_UPDATE_VALUES_VALIDATION = ( + "The 'update_values' argument must be a non-empty string and not None." +) +ERROR_BENCHMARK_UPDATE_COOKBOOK_UPDATE_VALUES_VALIDATION_1 = ( + "The 'update_values' argument must evaluate to a list of tuples." +) + +# ------------------------------------------------------------------------------ +# Benchmark - delete_cookbook +# ------------------------------------------------------------------------------ +ERROR_BENCHMARK_DELETE_COOKBOOK_COOKBOOK_VALIDATION = ( + "The 'cookbook' argument must be a non-empty string and not None." +) + +# ------------------------------------------------------------------------------ +# Benchmark - list_datasets +# ------------------------------------------------------------------------------ +ERROR_BENCHMARK_LIST_DATASETS_FIND_VALIDATION = ( + "The 'find' argument must be a non-empty string and not None." +) +ERROR_BENCHMARK_LIST_DATASETS_PAGINATION_VALIDATION = ( + "The 'pagination' argument must be a non-empty string and not None." +) +ERROR_BENCHMARK_LIST_DATASETS_PAGINATION_VALIDATION_1 = ( + "The 'pagination' argument must be a tuple of two integers." +) + +# ------------------------------------------------------------------------------ +# Benchmark - view_dataset +# ------------------------------------------------------------------------------ +ERROR_BENCHMARK_VIEW_DATASET_DATASET_FILENAME_VALIDATION = ( + "The 'dataset_filename' argument must be a non-empty string and not None." +) + +# ------------------------------------------------------------------------------ +# Benchmark - delete_dataset +# ------------------------------------------------------------------------------ +ERROR_BENCHMARK_DELETE_DATASET_DATASET_VALIDATION = ( + "The 'dataset' argument must be a non-empty string and not None." +) + +# ------------------------------------------------------------------------------ +# Benchmark - list_metrics +# ------------------------------------------------------------------------------ +ERROR_BENCHMARK_LIST_METRICS_FIND_VALIDATION = ( + "The 'find' argument must be a non-empty string and not None." +) +ERROR_BENCHMARK_LIST_METRICS_PAGINATION_VALIDATION = ( + "The 'pagination' argument must be a non-empty string and not None." +) +ERROR_BENCHMARK_LIST_METRICS_PAGINATION_VALIDATION_1 = ( + "The 'pagination' argument must be a tuple of two integers." +) + +# ------------------------------------------------------------------------------ +# Benchmark - view_metric +# ------------------------------------------------------------------------------ +ERROR_BENCHMARK_VIEW_METRIC_METRIC_FILENAME_VALIDATION = ( + "The 'metric_filename' argument must be a non-empty string and not None." +) + +# ------------------------------------------------------------------------------ +# Benchmark - delete_metric +# ------------------------------------------------------------------------------ +ERROR_BENCHMARK_DELETE_METRIC_METRIC_VALIDATION = ( + "The 'metric' argument must be a non-empty string and not None." 
+) + +# ------------------------------------------------------------------------------ +# Benchmark - list_results +# ------------------------------------------------------------------------------ +ERROR_BENCHMARK_LIST_RESULTS_FIND_VALIDATION = ( + "The 'find' argument must be a non-empty string and not None." +) +ERROR_BENCHMARK_LIST_RESULTS_PAGINATION_VALIDATION = ( + "The 'pagination' argument must be a non-empty string and not None." +) +ERROR_BENCHMARK_LIST_RESULTS_PAGINATION_VALIDATION_1 = ( + "The 'pagination' argument must be a tuple of two integers." +) + +# ------------------------------------------------------------------------------ +# Benchmark - view_result +# ------------------------------------------------------------------------------ +ERROR_BENCHMARK_VIEW_RESULT_RESULT_FILENAME_VALIDATION = ( + "The 'result_filename' argument must be a non-empty string and not None." +) +ERROR_BENCHMARK_VIEW_RESULT_METADATA_VALIDATION = "The 'metadata' argument not found." +ERROR_BENCHMARK_VIEW_RESULT_METADATA_INVALID_VALIDATION = ( + "Unable to determine cookbook or recipe." +) + +# ------------------------------------------------------------------------------ +# Benchmark - delete_result +# ------------------------------------------------------------------------------ +ERROR_BENCHMARK_DELETE_RESULT_RESULT_VALIDATION = ( + "The 'result' argument must be a non-empty string and not None." +) + +# ------------------------------------------------------------------------------ +# Benchmark - list_runs +# ------------------------------------------------------------------------------ +ERROR_BENCHMARK_LIST_RUNS_FIND_VALIDATION = ( + "The 'find' argument must be a non-empty string and not None." +) +ERROR_BENCHMARK_LIST_RUNS_PAGINATION_VALIDATION = ( + "The 'pagination' argument must be a non-empty string and not None." +) +ERROR_BENCHMARK_LIST_RUNS_PAGINATION_VALIDATION_1 = ( + "The 'pagination' argument must be a tuple of two integers." +) + +# ------------------------------------------------------------------------------ +# Benchmark - view_run +# ------------------------------------------------------------------------------ +ERROR_BENCHMARK_VIEW_RUN_RUNNER_ID_VALIDATION = ( + "The 'runner_id' argument must be a non-empty string and not None." +) + +# ------------------------------------------------------------------------------ +# Benchmark - add_recipe +# ------------------------------------------------------------------------------ +ERROR_BENCHMARK_ADD_RECIPE_NAME_VALIDATION = ( + "The 'name' argument must be a non-empty string and not None." +) +ERROR_BENCHMARK_ADD_RECIPE_DESC_VALIDATION = ( + "The 'description' argument must be a non-empty string and not None." +) +ERROR_BENCHMARK_ADD_RECIPE_TAGS_VALIDATION = ( + "The 'tags' argument must be a non-empty string and not None." +) +ERROR_BENCHMARK_ADD_RECIPE_TAGS_LIST_STR_VALIDATION = ( + "The 'tags' argument must be a list of strings after evaluation." +) +ERROR_BENCHMARK_ADD_RECIPE_CATEGORIES_VALIDATION = ( + "The 'categories' argument must be a non-empty string and not None." +) +ERROR_BENCHMARK_ADD_RECIPE_CATEGORIES_LIST_STR_VALIDATION = ( + "The 'categories' argument must be a list of strings after evaluation." +) +ERROR_BENCHMARK_ADD_RECIPE_DATASETS_VALIDATION = ( + "The 'datasets' argument must be a non-empty string and not None." +) +ERROR_BENCHMARK_ADD_RECIPE_DATASETS_LIST_STR_VALIDATION = ( + "The 'datasets' argument must be a list of strings after evaluation." 
+) +ERROR_BENCHMARK_ADD_RECIPE_PROMPT_TEMPLATES_VALIDATION = ( + "The 'prompt_templates' argument must be a non-empty string and not None." +) +ERROR_BENCHMARK_ADD_RECIPE_PROMPT_TEMPLATES_LIST_STR_VALIDATION = ( + "The 'prompt_templates' argument must be a list of strings after evaluation." +) +ERROR_BENCHMARK_ADD_RECIPE_METRICS_VALIDATION = ( + "The 'metrics' argument must be a non-empty string and not None." +) +ERROR_BENCHMARK_ADD_RECIPE_METRICS_LIST_STR_VALIDATION = ( + "The 'metrics' argument must be a list of strings after evaluation." +) +ERROR_BENCHMARK_ADD_RECIPE_GRADING_SCALE_VALIDATION = ( + "The 'grading_scale' argument must be a non-empty string and not None." +) +ERROR_BENCHMARK_ADD_RECIPE_GRADING_SCALE_DICT_STR_VALIDATION = ( + "The 'grading_scale' argument must be a dictionary after evaluation." +) + +# ------------------------------------------------------------------------------ +# Benchmark - list_recipes +# ------------------------------------------------------------------------------ +ERROR_BENCHMARK_LIST_RECIPES_FIND_VALIDATION = ( + "The 'find' argument must be a non-empty string and not None." +) +ERROR_BENCHMARK_LIST_RECIPES_PAGINATION_VALIDATION = ( + "The 'pagination' argument must be a non-empty string and not None." +) +ERROR_BENCHMARK_LIST_RECIPES_PAGINATION_VALIDATION_1 = ( + "The 'pagination' argument must be a tuple of two integers." +) + +# ------------------------------------------------------------------------------ +# Benchmark - view_recipe +# ------------------------------------------------------------------------------ +ERROR_BENCHMARK_VIEW_RECIPE_RECIPE_VALIDATION = ( + "The 'recipe' argument must be a non-empty string and not None." +) + +# ------------------------------------------------------------------------------ +# Benchmark - run_recipe +# ------------------------------------------------------------------------------ +ERROR_BENCHMARK_RUN_RECIPE_NAME_VALIDATION = ( + "The 'name' argument must be a non-empty string and not None." +) +ERROR_BENCHMARK_RUN_RECIPE_RECIPES_VALIDATION = ( + "The 'recipes' argument must be a non-empty string and not None." +) +ERROR_BENCHMARK_RUN_RECIPE_RECIPES_VALIDATION_1 = ( + "The 'recipes' argument must evaluate to a list of strings." +) +ERROR_BENCHMARK_RUN_RECIPE_ENDPOINTS_VALIDATION = ( + "The 'endpoints' argument must be a non-empty string and not None." +) +ERROR_BENCHMARK_RUN_RECIPE_ENDPOINTS_VALIDATION_1 = ( + "The 'endpoints' argument must evaluate to a list of strings." +) +ERROR_BENCHMARK_RUN_RECIPE_NUM_OF_PROMPTS_VALIDATION = ( + "The 'num_of_prompts' argument must be an integer." +) +ERROR_BENCHMARK_RUN_RECIPE_RANDOM_SEED_VALIDATION = ( + "The 'random_seed' argument must be an integer." +) +ERROR_BENCHMARK_RUN_RECIPE_SYS_PROMPT_VALIDATION = ( + "The 'system_prompt' argument must be a non-empty string and not None." +) +ERROR_BENCHMARK_RUN_RECIPE_RUNNER_PROC_MOD_VALIDATION = ( + "The 'runner_proc_module' argument must be a non-empty string and not None." +) +ERROR_BENCHMARK_RUN_RECIPE_RESULT_PROC_MOD_VALIDATION = ( + "The 'result_proc_module' argument must be a non-empty string and not None." +) +ERROR_BENCHMARK_RUN_RECIPE_NO_RESULT = "There are no results generated." + +# ------------------------------------------------------------------------------ +# Benchmark - update_recipe +# ------------------------------------------------------------------------------ +ERROR_BENCHMARK_UPDATE_RECIPE_RECIPE_VALIDATION = ( + "The 'recipe' argument must be a non-empty string and not None." 
+) +ERROR_BENCHMARK_UPDATE_RECIPE_UPDATE_VALUES_VALIDATION = ( + "The 'update_values' argument must be a non-empty string and not None." +) +ERROR_BENCHMARK_UPDATE_RECIPE_UPDATE_VALUES_VALIDATION_1 = ( + "The 'update_values' argument must evaluate to a list of tuples." +) + +# ------------------------------------------------------------------------------ +# Benchmark - delete_recipe +# ------------------------------------------------------------------------------ +ERROR_BENCHMARK_DELETE_RECIPE_RECIPE_VALIDATION = ( + "The 'recipe' argument must be a non-empty string and not None." +) + +# ------------------------------------------------------------------------------ +# Benchmark - view_runner +# ------------------------------------------------------------------------------ +ERROR_BENCHMARK_VIEW_RUNNER_RUNNER_VALIDATION = ( + "The 'runner' argument must be a non-empty string and not None." +) + +# ------------------------------------------------------------------------------ +# Benchmark - delete_runner +# ------------------------------------------------------------------------------ +ERROR_BENCHMARK_DELETE_RUNNER_RUNNER_VALIDATION = ( + "The 'runner' argument must be a non-empty string and not None." +) diff --git a/moonshot/src/runners/runner.py b/moonshot/src/runners/runner.py index d4017f05..42696b0d 100644 --- a/moonshot/src/runners/runner.py +++ b/moonshot/src/runners/runner.py @@ -282,7 +282,7 @@ def get_available_items() -> tuple[list[str], list[RunnerArguments]]: logger.error(f"[Runner] Failed to get available runners: {str(e)}") raise e - def close(self) -> None: + async def close(self) -> None: """ Closes the runner instance. diff --git a/tests/unit-tests/cli/test_benchmark.py b/tests/unit-tests/cli/test_benchmark.py new file mode 100644 index 00000000..408f742c --- /dev/null +++ b/tests/unit-tests/cli/test_benchmark.py @@ -0,0 +1,1125 @@ +# from moonshot.integrations.cli.benchmark.datasets import list_datasets +# from moonshot.integrations.cli.benchmark.metrics import list_metrics +# from moonshot.integrations.cli.benchmark.result import list_results +# from moonshot.integrations.cli.benchmark.run import list_runs +# import pytest +# from io import StringIO +# from unittest.mock import patch +# from moonshot.integrations.cli.cli import CommandLineInterface +# from moonshot.api import api_set_environment_variables +# import shutil +# import os +# import argparse + +# from moonshot.integrations.cli.benchmark.recipe import list_recipes +# from moonshot.integrations.cli.benchmark.cookbook import list_cookbooks + + +# @pytest.fixture +# def cli(): +# return CommandLineInterface() + +# def run_command(cli: CommandLineInterface, command_list: list = []): +# for command in command_list: +# cli.onecmd_plus_hooks(command) + +# def run_command_table(cli, command): +# with patch('sys.stdout', new_callable=StringIO) as mock_stdout: +# cli.onecmd_plus_hooks(command) +# return mock_stdout.getvalue() + +# def perform_assertion(cli, command_list, expected_output, capsys): +# run_command(cli, command_list) +# captured = capsys.readouterr() +# if captured.out: +# assert captured.out.rstrip() == expected_output or expected_output in captured.out.rstrip() +# else: +# assert expected_output in captured.err.rstrip() + +# def perform_assertion_function_output(expected_output, returned_results, capsys): +# if returned_results: +# assert any(expected_output in returned_result.values() for returned_result in returned_results) +# else: +# captured = capsys.readouterr() +# if captured.out: +# assert 
captured.out.rstrip() == expected_output or expected_output in captured.out.rstrip() + + +# ut_data_dir = "tests/unit-tests/src/data" +# ut_sample_dir = "tests/unit-tests/common/samples" + +# class TestBenchmarkingCLI: +# @pytest.fixture(autouse=True) +# def init(self): +# # Set environment variables for result paths +# api_set_environment_variables( +# { +# "RUNNERS": f"{ut_data_dir}/runners/", +# "DATABASES": f"{ut_data_dir}/databases/", +# "DATABASES_MODULES": f"{ut_data_dir}/databases-modules/", +# "DATASETS": f"{ut_data_dir}/datasets/", +# "CONNECTORS": f"{ut_data_dir}/connectors/", +# "CONNECTORS_ENDPOINTS": f"{ut_data_dir}/connectors-endpoints/", +# "IO_MODULES": f"{ut_data_dir}/io-modules/", +# "ATTACK_MODULES": f"{ut_data_dir}/attack-modules/", +# "CONTEXT_STRATEGY": f"{ut_data_dir}/context-strategy/", +# "COOKBOOKS": f"{ut_data_dir}/cookbooks/", +# "METRICS": f"{ut_data_dir}/metrics/", +# "PROMPT_TEMPLATES": f"{ut_data_dir}/prompt-templates/", +# "RECIPES": f"{ut_data_dir}/recipes/", +# "RUNNERS": f"{ut_data_dir}/runners/", +# "RUNNERS_MODULES": f"{ut_data_dir}/runner-modules/", +# "RESULTS_MODULES": f"{ut_data_dir}/results-modules/", +# "RESULTS": f"{ut_data_dir}/results/", +# } +# ) + +# # Copy cookbooks +# shutil.copyfile( +# f"{ut_sample_dir}/chinese-safety-cookbook.json", +# f"{ut_data_dir}/cookbooks/chinese-safety-cookbook.json", +# ) + +# shutil.copyfile( +# f"{ut_sample_dir}/tamil-language-cookbook.json", +# f"{ut_data_dir}/cookbooks/tamil-language-cookbook.json", +# ) + +# # Copy recipes +# shutil.copyfile( +# f"{ut_sample_dir}/bbq.json", +# f"{ut_data_dir}/recipes/bbq.json", +# ) +# shutil.copyfile( +# f"{ut_sample_dir}/arc.json", +# f"{ut_data_dir}/recipes/arc.json", +# ) + +# # Copy dataset +# shutil.copyfile( +# f"{ut_sample_dir}/bbq-lite-age-disamb.json", +# f"{ut_data_dir}/datasets/bbq-lite-age-disamb.json", +# ) +# shutil.copyfile( +# f"{ut_sample_dir}/bbq-lite-age-ambiguous.json", +# f"{ut_data_dir}/datasets/bbq-lite-age-ambiguous.json", +# ) +# shutil.copyfile( +# f"{ut_sample_dir}/arc-easy.json", +# f"{ut_data_dir}/datasets/arc-easy.json", +# ) +# shutil.copyfile( +# f"{ut_sample_dir}/arc-challenge.json", +# f"{ut_data_dir}/datasets/arc-challenge.json", +# ) +# # Copy metrics +# shutil.copyfile( +# f"{ut_sample_dir}/bertscore.py", +# f"{ut_data_dir}/metrics/bertscore.py", +# ) +# shutil.copyfile( +# f"{ut_sample_dir}/bleuscore.py", +# f"{ut_data_dir}/metrics/bleuscore.py", +# ) +# shutil.copyfile( +# f"{ut_sample_dir}/exactstrmatch.py", +# f"{ut_data_dir}/metrics/exactstrmatch.py", +# ) + +# # Copy prompt templates +# shutil.copyfile( +# f"{ut_sample_dir}/analogical-similarity.json", +# f"{ut_data_dir}/prompt-templates/analogical-similarity.json", +# ) +# shutil.copyfile( +# f"{ut_sample_dir}/mmlu.json", +# f"{ut_data_dir}/prompt-templates/mmlu.json", +# ) +# shutil.copyfile( +# f"{ut_sample_dir}/mcq-template.json", +# f"{ut_data_dir}/prompt-templates/mcq-template.json", +# ) + +# # Copy attack modules +# shutil.copyfile( +# f"{ut_sample_dir}/charswap_attack.py", +# f"{ut_data_dir}/attack-modules/charswap_attack.py", +# ) + +# # Copy connector +# shutil.copyfile( +# f"{ut_sample_dir}/openai-connector.py", +# f"{ut_data_dir}/connectors/openai-connector.py", +# ) + +# # Copy connector endpoint +# shutil.copyfile( +# f"{ut_sample_dir}/openai-gpt35-turbo.json", +# f"{ut_data_dir}/connectors-endpoints/openai-gpt35-turbo.json", +# ) + +# # Copy runner module +# shutil.copyfile( +# f"{ut_sample_dir}/benchmarking.py", +# f"{ut_data_dir}/runner-modules/benchmarking.py", 
+# ) + +# # Copy results module +# shutil.copyfile( +# f"{ut_sample_dir}/benchmarking-result.py", +# f"{ut_data_dir}/results-modules/benchmarking-result.py", +# ) + +# # Copy first sample runner +# shutil.copyfile( +# f"{ut_sample_dir}/my-new-recipe-runner.json", +# f"{ut_data_dir}/runners/my-new-recipe-runner.json", +# ) + +# shutil.copyfile( +# f"{ut_sample_dir}/my-new-recipe-runner.db", +# f"{ut_data_dir}/databases/my-new-recipe-runner.db", +# ) + +# # Copy first sample result +# shutil.copyfile( +# f"{ut_sample_dir}/my-new-recipe-runner-result.json", +# f"{ut_data_dir}/results/my-new-recipe-runner-result.json", +# ) + +# # Copy second sample result +# shutil.copyfile( +# f"{ut_sample_dir}/sample-result.json", +# f"{ut_data_dir}/results/sample-result.json", +# ) + + +# # Setup complete, proceed with tests +# yield + +# benchmarking_files = [ +# f"{ut_data_dir}/cookbooks/chinese-safety-cookbook.json", +# f"{ut_data_dir}/recipes/bbq.json", +# f"{ut_data_dir}/recipes/arc.json", +# f"{ut_data_dir}/datasets/bbq-lite-age-disamb.json", +# f"{ut_data_dir}/datasets/bbq-lite-age-ambiguous.json", +# f"{ut_data_dir}/metrics/bertscore.py", +# f"{ut_data_dir}/metrics/bleuscore.py", +# f"{ut_data_dir}/prompt-templates/analogical-similarity.json", +# f"{ut_data_dir}/prompt-templates/mmlu.json", +# f"{ut_data_dir}/attack-modules/charswap_attack.py", +# f"{ut_data_dir}/connectors/openai-connector.py", +# f"{ut_data_dir}/connectors-endpoints/openai-gpt35-turbo.json", +# f"{ut_data_dir}/runner-modules/benchmarking.py", +# f"{ut_data_dir}/results-modules/benchmarking-result.py", +# f"{ut_data_dir}/datasets/arc-easy.json", +# f"{ut_data_dir}/metrics/exactstrmatch.py", +# f"{ut_data_dir}/prompt-templates/mcq-template.json", +# f"{ut_data_dir}/datasets/arc-challenge.json", +# f"{ut_data_dir}/runners/my-new-recipe-runner.json", +# f"{ut_data_dir}/databases/my-new-recipe-runner.db", +# f"{ut_data_dir}/runners/my-runner.json", +# f"{ut_data_dir}/databases/my-runner.db", +# f"{ut_data_dir}/results/my-new-recipe-runner-result.json", +# f"{ut_data_dir}/results/sample-result.json", +# f"{ut_data_dir}/cookbooks/tamil-language-cookbook.json", +# ] + +# #files generated from unit tests +# benchmarking_files.extend([ +# f"{ut_data_dir}/cookbooks/my-unit-test-cookbook.json", +# f"{ut_data_dir}/databases/my-new-cookbook.db", +# f"{ut_data_dir}/databases/my-new-recipe.db", +# f"{ut_data_dir}/databases/my-unit-test-cookbook.db", +# f"{ut_data_dir}/results/my-new-cookbook.json", +# f"{ut_data_dir}/results/my-unit-test-cookbook.json", +# f"{ut_data_dir}/results/my-unit-test-recipe.json", +# f"{ut_data_dir}/runners/my-new-cookbook.json", +# f"{ut_data_dir}/runners/my-unit-test-cookbook.json", +# f"{ut_data_dir}/runners/my-unit-test-recipe.json", +# f"{ut_data_dir}/recipes/my-unit-test-recipe.json", +# ]) +# for benchmarking_file in benchmarking_files: +# if os.path.exists(benchmarking_file): +# os.remove(benchmarking_file) + + +# test_recipe_id = "my-unit-test-recipe" +# test_cookbook_id = "my-unit-test-cookbook" +# err_unrecognised_arg = "Error: unrecognized arguments" +# err_missing_required_arg = "Error: the following arguments are required" + +# # ------------------------------------------------------------------------------ +# # Creation of files +# # ------------------------------------------------------------------------------ +# @pytest.mark.parametrize( +# "command_list, expected_output", +# [ +# # Success: Add with missing optional args +# ( +# ["add_recipe 'My unit test recipe' " +# "'hello world description?!' 
" +# "\"['category1','category2']\" " +# "\"['bbq-lite-age-ambiguous']\" " +# "\"['bertscore','bleuscore']\" " +# "-p \"['analogical-similarity','mmlu']\" " +# "-t \"['tag1','tag2']\" "], +# f"[add_recipe]: Recipe ({test_recipe_id}) created." +# ), + +# # Failure: Add with 1 missing required argument +# ( +# ["add_recipe 'My unit test recipe' " +# "'hello world description?!' " +# "\"['category1','category2']\" " +# "\"['bbq-lite-age-ambiguous']\" " +# "-p \"['analogical-similarity','mmlu']\" " +# "-t \"['tag1','tag2']\" " +# "-g \"{'A':[80,100],'B':[60,79],'C':[40,59],'D':[20,39],'E':[0,19]}\" "], +# err_missing_required_arg +# ), + +# # Failure: Add with missing required arguments +# ( +# ["add_recipe 'My unit test recipe' " +# "'hello world description?!' " +# "\"['category1','category2']\" " +# "-p \"['analogical-similarity','mmlu']\" " +# "-t \"['tag1','tag2']\" " +# "-g \"{'A':[80,100],'B':[60,79],'C':[40,59],'D':[20,39],'E':[0,19]}\" "], +# err_missing_required_arg +# ), + +# # Success: Help example +# ( +# ["add_recipe 'My unit test recipe' " +# "'hello world description?!' " +# "\"['category1','category2']\" " +# "\"['bbq-lite-age-ambiguous']\" " +# "\"['bertscore','bleuscore']\" " +# "-p \"['analogical-similarity','mmlu']\" " +# "-t \"['tag1','tag2']\" " +# "-g \"{'A':[80,100],'B':[60,79],'C':[40,59],'D':[20,39],'E':[0,19]}\" "], +# f"[add_recipe]: Recipe ({test_recipe_id}) created." +# ), + +# # Failure: Add with non-existent dataset +# ( +# ["add_recipe 'My unit test recipe' " +# "'hello world description?!' " +# "\"['category1','category2']\" " +# "\"['bbq-lite-age-ambiguous', 'bbq-lite-age-ambiguousx']\" " +# "\"['bertscore','bleuscore']\" " +# "-p \"['analogical-similarity','mmlu']\" " +# "-t \"['tag1','tag2']\" " +# "-g \"{'A':[80,100],'B':[60,79],'C':[40,59],'D':[20,39],'E':[0,19]}\" "], +# "Dataset bbq-lite-age-ambiguousx does not exist." +# ), + + +# # Failure: Add with non-existent metric +# ( +# ["add_recipe 'My unit test recipe' " +# "'hello world description?!' " +# "\"['category1','category2']\" " +# "\"['bbq-lite-age-ambiguous', 'bbq-lite-age-ambiguous']\" " +# "\"['bertscore','bleuscorex']\" " +# "-p \"['analogical-similarity','mmlu']\" " +# "-t \"['tag1','tag2']\" " +# "-g \"{'A':[80,100],'B':[60,79],'C':[40,59],'D':[20,39],'E':[0,19]}\" "], +# "Metric bleuscorex does not exist." +# ), + + +# # Failure: Add with non-existent prompt template +# ( +# ["add_recipe 'My unit test recipe' " +# "'hello world description?!' " +# "\"['category1','category2']\" " +# "\"['bbq-lite-age-ambiguous']\" " +# "\"['bertscore','bleuscore']\" " +# "-p \"['analogical-similarity','mmlux']\" " +# "-t \"['tag1','tag2']\" " +# "-g \"{'A':[80,100],'B':[60,79],'C':[40,59],'D':[20,39],'E':[0,19]}\" "], +# "Prompt Template mmlux does not exist." +# ), + +# # Failure: Add with incorrect parameter type for lists +# ( +# ["add_recipe 'My unit test recipe' " +# "'hello world description?!' " +# "\"['category1','category2']\" " +# "\"['bbq-lite-age-ambiguous']\" " +# "\"['bertscore','bleuscore']\" " +# "-p \"['analogical-similarity','mmlu']\" " +# "-t \"'tag1'\" " +# "-g \"{'A':[80,100],'B':[60,79],'C':[40,59],'D':[20,39],'E':[0,19]}\" "], +# "[add_recipe]: 1 validation error for api_create_recipe" +# ), + +# # Failure: Add with unknown flag +# ( +# ["add_recipe 'My unit test recipe' " +# "'hello world description?!' 
" +# "\"['category1','category2']\" " +# "\"['bbq-lite-age-ambiguous']\" " +# "\"['bertscore','bleuscore']\" " +# "-p \"['analogical-similarity','mmlu']\" " +# "-t \"['tag1','tag2']\" " +# "-g \"{'A':[80,100],'B':[60,79],'C':[40,59],'D':[20,39],'E':[0,19]}\" " +# "-x o"], +# err_unrecognised_arg +# ), +# ] +# ) +# def test_add_recipe(self, cli, command_list, expected_output, capsys): +# perform_assertion(cli, command_list, expected_output, capsys) + + # @pytest.mark.parametrize( + # "command_list, expected_output", + # [ + # # Success: Help example + # ( + # ["add_cookbook 'My unit test cookbook' 'hello world description?!' " + # "\"['arc']\""], + # "[add_cookbook]: Cookbook (my-unit-test-cookbook) created." + # ), + + # # Failure: Add with 1 missing required argument + # ( + # ["add_cookbook 'hello world description?!' \"['arc']\""], + # err_missing_required_arg + # ), + + # # Failure: Add with missing required arguments + # ( + # ["add_cookbook \"['arc']\""], + # err_missing_required_arg + # ), + + # # Failure: Add with incorrect parameter type for description + # ( + # ["add_cookbook 'My unit test cookbook' 'hello world description?!' " + # "\"'this is not a list!!'\""], + # "[add_cookbook]: 1 validation error for api_create_cookbook" + # ), + + # # Failure: Add with incorrect parameter type for recipe list + # ( + # ["add_cookbook 'My unit test cookbook' 'hello world description?!' " + # "\"'this is not a list!!'\""], + # "[add_cookbook]: 1 validation error for api_create_cookbook" + # ), + # # Failure: Add with non-existent recipe + # ( + # ["add_cookbook 'My unit test cookbook' 'hello world description?!' " + # "\"['auto-categorisatison']\""], + # "recipe does not exist." + # ), + + # # Failure: Add with unknown flag + # ( + # ["add_cookbook 'My unit test cookbook' 'hello world description?!' " + # "\"['arc']\" -n 1"], + # err_unrecognised_arg + # ), + # ], + # ) + # def test_add_cookbook(self, cli, command_list, expected_output, capsys): + # perform_assertion(cli, command_list, expected_output, capsys) + + # # ------------------------------------------------------------------------------ + # # Listing and viewing data + # # ------------------------------------------------------------------------------ + # # @pytest.mark.parametrize( + # # "command_list, expected_output", + # # [ + # # # Success: ID + # # ([f"view_recipe {test_recipe_id}"], "id: my-unit-test-recipe"), + + # # # Success: description + # # ([f"view_recipe {test_recipe_id}"], "hello world description?!"), + + # # # Success: tags + # # ([f"view_recipe {test_recipe_id}"], "1. tag1"), + # # ([f"view_recipe {test_recipe_id}"], "2. tag2"), + + # # # Success: categories + # # ([f"view_recipe {test_recipe_id}"], "1. category1"), + # # ([f"view_recipe {test_recipe_id}"], "2. 
category2"), + + # # # Success: grading scale + # # ([f"view_recipe {test_recipe_id}"], "A [80 - 100]"), + # # ([f"view_recipe {test_recipe_id}"], "B [60 - 79]"), + # # ([f"view_recipe {test_recipe_id}"], "C [40 - 59]"), + # # ([f"view_recipe {test_recipe_id}"], "D [20 - 39]"), + # # ([f"view_recipe {test_recipe_id}"], "E [0 - 19]"), + + # # # Success: dataset + # # ([f"view_recipe {test_recipe_id}"], "bbq-lite-age-ambiguous"), + + # # # # Success: prompt template + # # # ("analogical-similarity"), + # # # ("mmlu"), + + # # # # Success: metric + # # # ("bertscore"), + # # # ("bleuscore"), + + # # # # Success: attack strategies + # # # ("charswap_attack") + + # # # Failure: Test with unrecognised flag + # # ([f"view_recipe {test_recipe_id} -x o"], err_unrecognised_arg), + + # # # Failure: Test with non-existment recipe + # # ([f"view_recipe nope"], "[view_recipe]: No recipes found with ID") + # # ] + # # ) + # # def test_view_recipe(self, cli, command_list, expected_output, capsys): + # # perform_assertion(cli, command_list, expected_output, capsys) + + + # @pytest.mark.parametrize( + # "command_list, expected_output", + # [ + # # Success: No optional args + # ( + # ["list_recipes"], + # "bbq" + # ), + + # # Success: Find with results + # ( + # ["list_recipes -f bbq"], + # "bbq" + # ), + # # Success: Optional args with no results found + # ( + # ["list_recipes -f \"RandomArg\""], + # "There are no recipes found." + # ), + + # # Failure: List with unknown flag + # ( + # ["list_recipes -x test"], + # err_unrecognised_arg + # ), + # ] + # ) + # def test_list_recipes(self, cli, command_list, expected_output, capsys): + # perform_assertion(cli, command_list, expected_output, capsys) + + # @pytest.mark.parametrize( + # "function_args, expected_output", + # [ + # # Success: no results + # ("wrong_recipes", "There are no recipes found."), + + # # Success: results returned + # ("bbq", "bbq"), + # ] + # ) + # def test_list_recipes_output(self, function_args, expected_output, capsys): + # # additional function to test listing as the list command is hard to assert in CLI + # parser = argparse.ArgumentParser() + # parser.add_argument("-f", "--find", type=str, nargs="?") + # parser.add_argument("-p", "--pagination", type=str, nargs="?") + # args = parser.parse_args(['--find', function_args]) + + # returned_results = list_recipes(args) + # perform_assertion_function_output(expected_output, returned_results, capsys) + + # # def test_view_cookbook(self, cli): + # # pass + + # @pytest.mark.parametrize( + # "command_list, expected_output", + # [ + # # Success: No optional args + # ( + # ["list_cookbooks"], + # "chinese-safety-cookbook" + # ), + + # # Success: Find with results + # ( + # ["list_cookbooks -f tamil"], + # "tamil-language-cookbook" + # ), + # # Success: Optional args with no results found + # ( + # ["list_cookbooks -f \"RandomArg\""], + # "There are no cookbooks found." 
+ # ), + + # # Failure: List with unknown flag + # ( + # ["list_cookbooks -x test"], + # err_unrecognised_arg + # ), + # ] + # ) + # def test_list_cookbooks(self, cli, command_list, expected_output, capsys): + # perform_assertion(cli, command_list, expected_output, capsys) + + # @pytest.mark.parametrize( + # "function_args, expected_output", + # [ + # # Success: no results + # ("no-such-cookbook", "There are no cookbooks found."), + + # # Success: results returned + # ("chinese", "chinese-safety-cookbook"), + # ] + # ) + # def test_list_cookbooks_output(self, function_args, expected_output, capsys): + # # additional function to test listing as the list command is hard to assert in CLI + # parser = argparse.ArgumentParser() + # parser.add_argument("-f", "--find", type=str, nargs="?") + # parser.add_argument("-p", "--pagination", type=str, nargs="?") + # args = parser.parse_args(['--find', function_args]) + + # returned_results = list_cookbooks(args) + # perform_assertion_function_output(expected_output, returned_results, capsys) + + # # def test_view_dataset(self, cli): + # # pass + + # @pytest.mark.parametrize( + # "command_list, expected_output", + # [ + # # Success: No optional args + # ( + # ["list_datasets"], + # "arc-easy" + # ), + + # # Success: Find with results + # ( + # ["list_datasets -f bbq"], + # "bbq-lite-age-disamb" + # ), + # # Success: Optional args with no results found + # ( + # ["list_datasets -f \"RandomArg\""], + # "There are no datasets found." + # ), + + # # Failure: List with unknown flag + # ( + # ["list_datasets -x test"], + # err_unrecognised_arg + # ), + # ] + # ) + # def test_list_datasets(self, cli, command_list, expected_output, capsys): + # perform_assertion(cli, command_list, expected_output, capsys) + + # @pytest.mark.parametrize( + # "function_args, expected_output", + # [ + # # Success: no results + # ("no-such-dataset", "There are no datasets found."), + + # # Success: results returned + # ("arc", "arc-easy"), + # ] + # ) + # def test_list_datasets_output(self, function_args, expected_output, capsys): + # # additional function to test listing as the list command is hard to assert in CLI + # parser = argparse.ArgumentParser() + # parser.add_argument("-f", "--find", type=str, nargs="?") + # parser.add_argument("-p", "--pagination", type=str, nargs="?") + # args = parser.parse_args(['--find', function_args]) + + # returned_results = list_datasets(args) + # perform_assertion_function_output(expected_output, returned_results, capsys) + + # # def test_view_metric(self, cli): + # # pass + + + # @pytest.mark.parametrize( + # "command_list, expected_output", + # [ + # # Success: No optional args + # ( + # ["list_metrics"], + # "bleuscore" + # ), + + # # Success: Find with results + # ( + # ["list_metrics -f bertscore"], + # "bertscore" + # ), + # # Success: Optional args with no results found + # ( + # ["list_metrics -f \"RandomArg\""], + # "There are no metrics found." 
+ # ), + + # # Failure: List with unknown flag + # ( + # ["list_metrics -x test"], + # err_unrecognised_arg + # ), + # ] + # ) + # def test_list_metrics(self, cli, command_list, expected_output, capsys): + # perform_assertion(cli, command_list, expected_output, capsys) + + # @pytest.mark.parametrize( + # "function_args, expected_output", + # [ + # # Success: no results + # ("no-such-metrics", "There are no metrics found."), + + # # Success: results returned + # ("bert", "bertscore"), + # ] + # ) + # def test_list_metrics_output(self, function_args, expected_output, capsys): + # # additional function to test listing as the list command is hard to assert in CLI + # parser = argparse.ArgumentParser() + # parser.add_argument("-f", "--find", type=str, nargs="?") + # parser.add_argument("-p", "--pagination", type=str, nargs="?") + # args = parser.parse_args(['--find', function_args]) + + # returned_results = list_metrics(args) + # perform_assertion_function_output(expected_output, returned_results, capsys) + + # # ------------------------------------------------------------------------------ + # # Updating of files + # # ------------------------------------------------------------------------------ + # @pytest.mark.parametrize( + # "command_list, expected_output", + # [ + # # Success: Help example update with missing optional arguments + # (["add_recipe 'My unit test recipe' " + # "'hello world description?!' " + # "\"['category1','category2']\" " + # "\"['bbq-lite-age-ambiguous']\" " + # "\"['bertscore','bleuscore']\" " + # "-p \"['analogical-similarity','mmlu']\" " + # "-t \"['tag1','tag2']\" ", + # f"update_recipe {test_recipe_id} \"[('name', 'My Updated Recipe'), ('tags', ['fairness', 'bbq'])]\""], + # "[update_recipe]: Recipe updated." + # ), + + # # Success: Update every available key + # (["add_recipe 'My unit test recipe' " + # "'hello world description?!' " + # "\"['category1','category2']\" " + # "\"['bbq-lite-age-ambiguous']\" " + # "\"['bertscore','bleuscore']\" " + # "-p \"['analogical-similarity','mmlu']\" " + # "-t \"['tag1','tag2']\" ", + # f"update_recipe {test_recipe_id} \"[('name', 'My Updated Recipe2'), ('tags', ['updated tag']), " + # "('description', 'updated description'), ('categories', ['updated cat 1', 'updated cat 2']), " + # " ('datasets', ['bbq-lite-age-ambiguous']), ('prompt_templates', ['analogical-similarity', 'mmlu']), " + # " ('metrics', ['bleuscore']), ('attack_modules', ['charswap_attack']), " + # " ('grading_scale', {'New A':[75,100],'New B':[50,74],'New C':[25,49],'New D':[0,24]}) " + # "]\""], + # "[update_recipe]: Recipe updated." + # ), + + # # Failure: Update with some wrong parameter types + # (["add_recipe 'My unit test recipe' " + # "'hello world description?!' 
" + # "\"['category1','category2']\" " + # "\"['bbq-lite-age-ambiguous']\" " + # "\"['bertscore','bleuscore']\" " + # "-p \"['analogical-similarity','mmlu']\" " + # "-t \"['tag1','tag2']\" ", + # f"update_recipe {test_recipe_id} \"[('name', ['Name should not be a list']), ('tags', ['updated tag']), " + # " ('description', 'updated description'), ('categories', ['updated cat 1', 'updated cat 2']), " + # " ('datasets', ['bbq-lite-age-ambiguous']), ('prompt_templates', ['analogical-similarity', 'mmlu']), " + # " ('metrics', ['bleuscore']), ('attack_modules', ['charswap_attack']), " + # " ('grading_scale', [{'New A':[75,100],'New B':[50,74],'New C':[25,49],'New D':[0,24]}]) " + # "]\""], + # "[update_recipe]: 2 validation errors for RecipeArguments" + # ), + + # # Failure: Update with missing required argument + # ([f"update_recipe \"[('name', 'My Updated Recipe2'), ('tags', ['updated tag']), " + # "('description', 'updated description'), ('categories', ['updated cat 1', 'updated cat 2']), " + # " ('datasets', ['bbq-lite-age-ambiguous']), ('prompt_templates', ['analogical-similarity', 'mmlu']), " + # " ('metrics', ['bleuscore']), ('attack_modules', ['charswap_attack']), " + # " ('grading_scale', {'New A':[75,100],'New B':[50,74],'New C':[25,49],'New D':[0,24]}) " + # "]\""], + # err_missing_required_arg + # ), + + # # Failure: Update with non-existent dataset + # (["add_recipe 'My unit test recipe' " + # "'hello world description?!' " + # "\"['category1','category2']\" " + # "\"['bbq-lite-age-ambiguous']\" " + # "\"['bertscore','bleuscore']\" " + # "-p \"['analogical-similarity','mmlu']\" " + # "-t \"['tag1','tag2']\" ", + # f"update_recipe {test_recipe_id} \"[('name', 'My Updated Recipe2'), ('tags', ['updated tag']), " + # "('description', 'updated description'), ('categories', ['updated cat 1', 'updated cat 2']), " + # " ('datasets', ['nope']), ('prompt_templates', ['analogical-similarity', 'mmlu']), " + # " ('metrics', ['bleuscore']), ('attack_modules', ['charswap_attack']), " + # " ('grading_scale', {'New A':[75,100],'New B':[50,74],'New C':[25,49],'New D':[0,24]}) " + # "]\""], + # "Dataset nope does not exist." + # ), + + # # Failure: Update with non-existent metric + # (["add_recipe 'My unit test recipe' " + # "'hello world description?!' " + # "\"['category1','category2']\" " + # "\"['bbq-lite-age-ambiguous']\" " + # "\"['bertscore','bleuscore']\" " + # "-p \"['analogical-similarity','mmlu']\" " + # "-t \"['tag1','tag2']\" ", + # f"update_recipe {test_recipe_id} \"[('name', 'My Updated Recipe2'), ('tags', ['updated tag']), " + # "('description', 'updated description'), ('categories', ['updated cat 1', 'updated cat 2']), " + # " ('datasets', ['bbq-lite-age-ambiguous']), ('prompt_templates', ['analogical-similarity', 'mmlu']), " + # " ('metrics', ['nope']), ('attack_modules', ['charswap_attack']), " + # " ('grading_scale', {'New A':[75,100],'New B':[50,74],'New C':[25,49],'New D':[0,24]}) " + # "]\""], + # "Metric nope does not exist." + # ), + + # # Failure: Update with non-existent prompt template + # (["add_recipe 'My unit test recipe' " + # "'hello world description?!' 
" + # "\"['category1','category2']\" " + # "\"['bbq-lite-age-ambiguous']\" " + # "\"['bertscore','bleuscore']\" " + # "-p \"['analogical-similarity','mmlu']\" " + # "-t \"['tag1','tag2']\" ", + # f"update_recipe {test_recipe_id} \"[('name', 'My Updated Recipe2'), ('tags', ['updated tag']), " + # "('description', 'updated description'), ('categories', ['updated cat 1', 'updated cat 2']), " + # " ('datasets', ['bbq-lite-age-ambiguous']), ('prompt_templates', ['analogical-similarity', 'nope']), " + # " ('metrics', ['bleuscore']), ('attack_modules', ['charswap_attack']), " + # " ('grading_scale', {'New A':[75,100],'New B':[50,74],'New C':[25,49],'New D':[0,24]}) " + # "]\""], + # "Prompt Template nope does not exist." + # ), + + # # Failure: Update with unknown flag + # ([f"update_recipe {test_recipe_id} \"[('name', 'My Updated Recipe2'), ('tags', ['updated tag']), " + # "('description', 'updated description'), ('categories', ['updated cat 1', 'updated cat 2']), " + # " ('datasets', ['nope']), ('prompt_templates', ['analogical-similarity', 'mmlu']), " + # " ('metrics', ['bleuscore']), ('attack_modules', ['charswap_attack']), " + # " ('grading_scale', {'New A':[75,100],'New B':[50,74],'New C':[25,49],'New D':[0,24]}) " + # "]\" -x o"], + # err_unrecognised_arg + # ), + + # ] + # ) + # def test_update_recipe(self, cli, command_list, expected_output, capsys): + # perform_assertion(cli, command_list, expected_output, capsys) + + # @pytest.mark.parametrize( + # "command_list, expected_output", + # [ + # # Success: Help example + # (["add_cookbook 'My unit test cookbook' 'hello world description?!' " + # "\"['arc']\"", + # f"update_cookbook {test_cookbook_id} \"[('name', 'Updated cookbook name'), " + # "('description', 'Updated description'), ('recipes', ['arc'])]\""], + # "[update_cookbook]: Cookbook updated." + # ), + + # # Success: Update some keys + # (["add_cookbook 'My unit test cookbook' 'hello world description?!' " + # "\"['arc']\"", + # f"update_cookbook {test_cookbook_id} \"[('description', 'Updated cookbook description. again.')]\""], + # "[update_cookbook]: Cookbook updated." + # ), + + # # Failure: Update with some wrong parameter types + # (["add_cookbook 'My unit test cookbook' 'hello world description?!' " + # "\"['arc']\"", + # f"update_cookbook {test_cookbook_id} \"[('name', ['Updated cookbook name']), " + # "('description', 'Updated description'), ('recipes', ['arc'])]\""], + # "[update_cookbook]: 1 validation error for CookbookArguments" + # ), + + # # Failure: Update with missing required argument + # ([ + # f"update_cookbook \"\""], + # err_missing_required_arg + # ), + + # # # Failure: Update with unknown flag + # ([ + # f"update_cookbook {test_cookbook_id} \"[('name', 'Updated cookbook name'), " + # "('description', 'Updated description'), ('recipes', ['arc'])]\" -x o"], + # err_unrecognised_arg + # ), + # ] + # ) + # def test_update_cookbook(self, cli, command_list, expected_output, capsys): + # perform_assertion(cli, command_list, expected_output, capsys) + + + # # ------------------------------------------------------------------------------ + # # Running of recipes and cookbooks, and viewing the files generated (Commented out to not run the benchmarks. + # # Uncomment to run tests. Add in your token in the connector endpoints to run the tests) + # # ------------------------------------------------------------------------------ + + # @pytest.mark.parametrize( + # "command_list, expected_output", + # [ + # # # Success: Help example. 
+ # # Uncomment this to run the actual benchmarking test with your own token + # # # Add in your own token also + # # ([f"run_recipe {test_recipe_id} \"['arc']\" \"['openai-gpt35-turbo']\" -n 1 -r 1 " + # # "-s \"You are an intelligent AI\""], + # # "Time taken to run" + # # ), + + # # # Failure: Run with non-existent recipes with new runner + # # ([f"run_recipe my_new_recipex \"['arc']\" \"['openai-gpt35-turbo']\" -n 1 -r 1 " + # # "-s \"You are an intelligent AI\""], + # # "No recipes found with ID" + # # ), + + # # Failure: Run with non-existent connector endpoint with new runner + # ([f"run_recipe my_new_recipe_two \"['arc']\" \"['openai-gpt35-turbox']\" -n 1 -r 1 " + # "-s \"You are an intelligent AI\""], + # "Connector endpoint openai-gpt35-turbox does not exist." + # ), + + # # Failure: Run with wrong type for optional arguments (input string instead of int) + # ([f"run_recipe my_new_recipe \"['arc']\" \"['openai-gpt35-turbox']\" -n x -r s " + # "-s \"You are an intelligent AI\""], + # "invalid int value" + # ), + + # # Failure: Run with unknown flag + # ([f"run_recipe my_new_recipe \"['arc']\" \"['openai-gpt35-turbox']\" -n 1 -r 1 " + # "-s \"You are an intelligent AI\" -x o"], + # err_unrecognised_arg + # ), + # ] + # ) + # def test_run_recipe(self, cli, command_list, expected_output, capsys): + # perform_assertion(cli, command_list, expected_output, capsys) + + + # @pytest.mark.parametrize( + # "command_list, expected_output", + # [ + # # # Success: Help example + # # ([f"run_cookbook {test_cookbook_id} \"['chinese-safety-cookbook']\" \"['openai-gpt35-turbo']\" -n 1 -r 1 " + # # "-s \"You are an intelligent AI\""], + # # "Time taken to run" + # # ), + + # # Failure: Run with non-existent cookbook + # # ([f"run_cookbook my_new_cookbook \"['chinese-safety-cookbookx']\" \"['openai-gpt35-turbo']\" -n 1 -r 1 " + # # "-s \"You are an intelligent AI\""], + # # "No cookbooks found with ID" + # # ), + + # # Failure: Run with non-existent connector endpoint with new runner + # ([f"run_cookbook my_new_cookbook_two \"['chinese-safety-cookbook']\" \"['openai-gpt35-turbox']\" -n 1 -r 1 " + # "-s \"You are an intelligent AI\""], + # "Connector endpoint openai-gpt35-turbox does not exist." + # ), + + + # # Failure: Run with wrong type for optional arguments (input string instead of int) + # ([f"run_cookbook my_new_cookbook \"['chinese-safety-cookbook']\" \"['openai-gpt35-turbo']\" -n x -r s " + # "-s \"You are an intelligent AI\""], + # "invalid int value" + # ), + + # # Failure: Run with unknown flag + # ([f"run_cookbook my_new_cookbook\"['chinese-safety-cookbook']\" \"['openai-gpt35-turbo']\" -n 1 -r 1 " + # "-s \"You are an intelligent AI\" -x o"], + # err_unrecognised_arg + # ), + # ] + # ) + # def test_run_cookbook(self, cli, command_list, expected_output, capsys): + # perform_assertion(cli, command_list, expected_output, capsys) + + # # def test_view_result(self, cli): + # # pass + + # @pytest.mark.parametrize( + # "command_list, expected_output", + # [ + # # Success: No optional args + # ( + # ["list_results"], + # "my-new-recipe-runner-result" + # ), + + # # Success: Find with results + # ( + # ["list_results -f sample-result"], + # "sample-result" + # ), + # # Success: Optional args with no results found + # ( + # ["list_results -f \"RandomArg\""], + # "There are no results found." 
+ # ), + + # # Failure: List with unknown flag + # ( + # ["list_results -x test"], + # err_unrecognised_arg + # ), + # ] + # ) + # def test_list_results(self, cli, command_list, expected_output, capsys): + # perform_assertion(cli, command_list, expected_output, capsys) + + # @pytest.mark.parametrize( + # "function_args, expected_output", + # [ + # # Success: no results + # ("no-such-result", "There are no results found."), + + # # # Success: results returned + # # ("my-new-recipe-runner", "my-new-recipe-runner-result"), + # ] + # ) + # def test_list_results_output(self, function_args, expected_output, capsys): + # # additional function to test listing as the list command is hard to assert in CLI + # parser = argparse.ArgumentParser() + # parser.add_argument("-f", "--find", type=str, nargs="?") + # parser.add_argument("-p", "--pagination", type=str, nargs="?") + # args = parser.parse_args(['--find', function_args]) + + # returned_results = list_results(args) + # perform_assertion_function_output(expected_output, returned_results, capsys) + + # # def test_view_run(self, cli): + # # pass + + # @pytest.mark.parametrize( + # "command_list, expected_output", + # [ + # # Success: No optional args + # ( + # ["list_runs"], + # "my-new-recipe-runner" + # ), + + # # Success: Find with results + # ( + # ["list_runs -f my-new-recipe-runner"], + # "my-new-recipe-runner" + # ), + # # Success: Optional args with no results found + # ( + # ["list_runs -f \"RandomArg\""], + # "There are no runs found." + # ), + + # # Failure: List with unknown flag + # ( + # ["list_runs -x test"], + # err_unrecognised_arg + # ), + # ] + # ) + # def test_list_runs(self, cli, command_list, expected_output, capsys): + # perform_assertion(cli, command_list, expected_output, capsys) + + # @pytest.mark.parametrize( + # "function_args, expected_output", + # [ + # # Success: no results + # ("no-such-run", "There are no runs found."), + + # # # Success: results returned + # # ("my-new-recipe-runner", "my-new-recipe-runner"), + # ] + # ) + # def test_list_runs_output(self, function_args, expected_output, capsys): + # # additional function to test listing as the list command is hard to assert in CLI + # parser = argparse.ArgumentParser() + # parser.add_argument("-f", "--find", type=str, nargs="?") + # parser.add_argument("-p", "--pagination", type=str, nargs="?") + # args = parser.parse_args(['--find', function_args]) + + # returned_results = list_runs(args) + # perform_assertion_function_output(expected_output, returned_results, capsys) + + + # # def test_view_runner(self, cli): + # # pass + + + # @pytest.mark.parametrize( + # "command_list, expected_output", + # [ + # # Success: No optional args + # ( + # ["list_runners"], + # "my-new-recipe-runner" + # ), + + # # # Success: List runs with unknown flag will not have an error + # # because list_runners does not take in an arg (find will be implemented soon) + # ( + # ["list_runners -x test"], + # "my-new-recipe-runner" + # ), + # ] + # ) + # def test_list_runners(self, cli, command_list, expected_output, capsys): + # perform_assertion(cli, command_list, expected_output, capsys) + + + # # ------------------------------------------------------------------------------ + # # Deletion of files + # # ------------------------------------------------------------------------------ + # @pytest.mark.parametrize( + # "command_list, expected_output", + # [ + # # # Success: Delete existing recipe TOFIX + # # (f"delete_recipe {test_recipe_id}", "y", "[delete_recipe]: Recipe deleted."), + + # # 
Failure: Delete with missing argument + # ([f"delete_recipe"], err_missing_required_arg), + + # # Failure: Delete with unknown flag + # ([f"delete_recipe {test_recipe_id} -x o"], err_unrecognised_arg), + # ] + # ) + # def test_delete_session(self, cli, command_list, expected_output, capsys): + # perform_assertion(cli, command_list, expected_output, capsys) + + # @pytest.mark.parametrize( + # "command_list, expected_output", + # [ + # # # Success: Delete existing cookbook + # # (f"delete_cookbook {test_cookbook_id}", "y", "[delete_cookbook]: Cookbook deleted."), + + # # Failure: Delete with missing argument + # ([f"delete_cookbook"], err_missing_required_arg), + + # # Failure: Delete with unknown flag + # ([f"delete_cookbook {test_cookbook_id} -x o"], err_unrecognised_arg), + # ] + # ) + # def test_delete_cookbook(self, cli, command_list, expected_output, capsys): + # perform_assertion(cli, command_list, expected_output, capsys) + + # def test_delete_dataset(self, cli): + # pass + + # def test_delete_metrics(self, cli): + # pass + + # def test_delete_result(self, cli): + # pass + + # def test_delete_runner(self, cli): + # pass diff --git a/tests/unit-tests/cli/test_benchmarking.py b/tests/unit-tests/cli/test_benchmarking.py deleted file mode 100644 index 04ef9cf5..00000000 --- a/tests/unit-tests/cli/test_benchmarking.py +++ /dev/null @@ -1,1130 +0,0 @@ -from moonshot.integrations.cli.benchmark.datasets import list_datasets -from moonshot.integrations.cli.benchmark.metrics import list_metrics -from moonshot.integrations.cli.benchmark.result import list_results -from moonshot.integrations.cli.benchmark.run import list_runs -import pytest -from io import StringIO -from unittest.mock import patch -from moonshot.integrations.cli.cli import CommandLineInterface -from moonshot.api import api_set_environment_variables -import shutil -import os -import argparse - -from moonshot.integrations.cli.benchmark.recipe import list_recipes -from moonshot.integrations.cli.benchmark.cookbook import list_cookbooks - - -@pytest.fixture -def cli(): - return CommandLineInterface() - -def run_command(cli: CommandLineInterface, command_list: list = []): - for command in command_list: - cli.onecmd_plus_hooks(command) - -def run_command_table(cli, command): - with patch('sys.stdout', new_callable=StringIO) as mock_stdout: - cli.onecmd_plus_hooks(command) - return mock_stdout.getvalue() - -def perform_assertion(cli, command_list, expected_output, capsys): - run_command(cli, command_list) - captured = capsys.readouterr() - if captured.out: - assert captured.out.rstrip() == expected_output or expected_output in captured.out.rstrip() - else: - assert expected_output in captured.err.rstrip() - -def perform_assertion_function_output(expected_output, returned_results, capsys): - if returned_results: - assert any(expected_output in returned_result.values() for returned_result in returned_results) - else: - captured = capsys.readouterr() - if captured.out: - assert captured.out.rstrip() == expected_output or expected_output in captured.out.rstrip() - - -ut_data_dir = "tests/unit-tests/src/data" -ut_sample_dir = "tests/unit-tests/common/samples" - -class TestBenchmarkingCLI: - @pytest.fixture(autouse=True) - def init(self): - # Set environment variables for result paths - api_set_environment_variables( - { - "RUNNERS": f"{ut_data_dir}/runners/", - "DATABASES": f"{ut_data_dir}/databases/", - "DATABASES_MODULES": f"{ut_data_dir}/databases-modules/", - "DATASETS": f"{ut_data_dir}/datasets/", - "CONNECTORS": 
f"{ut_data_dir}/connectors/", - "CONNECTORS_ENDPOINTS": f"{ut_data_dir}/connectors-endpoints/", - "IO_MODULES": f"{ut_data_dir}/io-modules/", - "ATTACK_MODULES": f"{ut_data_dir}/attack-modules/", - "CONTEXT_STRATEGY": f"{ut_data_dir}/context-strategy/", - "COOKBOOKS": f"{ut_data_dir}/cookbooks/", - "METRICS": f"{ut_data_dir}/metrics/", - "PROMPT_TEMPLATES": f"{ut_data_dir}/prompt-templates/", - "RECIPES": f"{ut_data_dir}/recipes/", - "RUNNERS": f"{ut_data_dir}/runners/", - "RUNNERS_MODULES": f"{ut_data_dir}/runner-modules/", - "RESULTS_MODULES": f"{ut_data_dir}/results-modules/", - "RESULTS": f"{ut_data_dir}/results/", - } - ) - - # Copy cookbooks - shutil.copyfile( - f"{ut_sample_dir}/chinese-safety-cookbook.json", - f"{ut_data_dir}/cookbooks/chinese-safety-cookbook.json", - ) - - shutil.copyfile( - f"{ut_sample_dir}/tamil-language-cookbook.json", - f"{ut_data_dir}/cookbooks/tamil-language-cookbook.json", - ) - - # Copy recipes - shutil.copyfile( - f"{ut_sample_dir}/bbq.json", - f"{ut_data_dir}/recipes/bbq.json", - ) - shutil.copyfile( - f"{ut_sample_dir}/arc.json", - f"{ut_data_dir}/recipes/arc.json", - ) - - # Copy dataset - shutil.copyfile( - f"{ut_sample_dir}/bbq-lite-age-disamb.json", - f"{ut_data_dir}/datasets/bbq-lite-age-disamb.json", - ) - shutil.copyfile( - f"{ut_sample_dir}/bbq-lite-age-ambiguous.json", - f"{ut_data_dir}/datasets/bbq-lite-age-ambiguous.json", - ) - shutil.copyfile( - f"{ut_sample_dir}/arc-easy.json", - f"{ut_data_dir}/datasets/arc-easy.json", - ) - shutil.copyfile( - f"{ut_sample_dir}/arc-challenge.json", - f"{ut_data_dir}/datasets/arc-challenge.json", - ) - # Copy metrics - shutil.copyfile( - f"{ut_sample_dir}/bertscore.py", - f"{ut_data_dir}/metrics/bertscore.py", - ) - shutil.copyfile( - f"{ut_sample_dir}/bleuscore.py", - f"{ut_data_dir}/metrics/bleuscore.py", - ) - shutil.copyfile( - f"{ut_sample_dir}/exactstrmatch.py", - f"{ut_data_dir}/metrics/exactstrmatch.py", - ) - - # Copy prompt templates - shutil.copyfile( - f"{ut_sample_dir}/analogical-similarity.json", - f"{ut_data_dir}/prompt-templates/analogical-similarity.json", - ) - shutil.copyfile( - f"{ut_sample_dir}/mmlu.json", - f"{ut_data_dir}/prompt-templates/mmlu.json", - ) - shutil.copyfile( - f"{ut_sample_dir}/mcq-template.json", - f"{ut_data_dir}/prompt-templates/mcq-template.json", - ) - - # Copy attack modules - shutil.copyfile( - f"{ut_sample_dir}/charswap_attack.py", - f"{ut_data_dir}/attack-modules/charswap_attack.py", - ) - - # Copy connector - shutil.copyfile( - f"{ut_sample_dir}/openai-connector.py", - f"{ut_data_dir}/connectors/openai-connector.py", - ) - - # # Copy connector endpoint - # shutil.copyfile( - # f"{ut_sample_dir}/openai-gpt4.json", - # f"{ut_data_dir}/connectors-endpoints/openai-gpt4.json", - # ) - shutil.copyfile( - f"{ut_sample_dir}/openai-gpt35-turbo.json", - f"{ut_data_dir}/connectors-endpoints/openai-gpt35-turbo.json", - ) - - # Copy runner module - shutil.copyfile( - f"{ut_sample_dir}/benchmarking.py", - f"{ut_data_dir}/runner-modules/benchmarking.py", - ) - - # Copy results module - shutil.copyfile( - f"{ut_sample_dir}/benchmarking-result.py", - f"{ut_data_dir}/results-modules/benchmarking-result.py", - ) - - # Copy first sample runner - shutil.copyfile( - f"{ut_sample_dir}/my-new-recipe-runner.json", - f"{ut_data_dir}/runners/my-new-recipe-runner.json", - ) - - shutil.copyfile( - f"{ut_sample_dir}/my-new-recipe-runner.db", - f"{ut_data_dir}/databases/my-new-recipe-runner.db", - ) - - # Copy first sample result - shutil.copyfile( - 
f"{ut_sample_dir}/my-new-recipe-runner-result.json", - f"{ut_data_dir}/results/my-new-recipe-runner-result.json", - ) - - # Copy second sample result - shutil.copyfile( - f"{ut_sample_dir}/sample-result.json", - f"{ut_data_dir}/results/sample-result.json", - ) - - - # Setup complete, proceed with tests - yield - - benchmarking_files = [ - f"{ut_data_dir}/cookbooks/chinese-safety-cookbook.json", - f"{ut_data_dir}/recipes/bbq.json", - f"{ut_data_dir}/recipes/arc.json", - f"{ut_data_dir}/datasets/bbq-lite-age-disamb.json", - f"{ut_data_dir}/datasets/bbq-lite-age-ambiguous.json", - f"{ut_data_dir}/metrics/bertscore.py", - f"{ut_data_dir}/metrics/bleuscore.py", - f"{ut_data_dir}/prompt-templates/analogical-similarity.json", - f"{ut_data_dir}/prompt-templates/mmlu.json", - f"{ut_data_dir}/attack-modules/charswap_attack.py", - f"{ut_data_dir}/connectors/openai-connector.py", - # f"{ut_data_dir}/connectors-endpoints/openai-gpt4.json", - f"{ut_data_dir}/connectors-endpoints/openai-gpt35-turbo.json", - f"{ut_data_dir}/runner-modules/benchmarking.py", - f"{ut_data_dir}/results-modules/benchmarking-result.py", - f"{ut_data_dir}/datasets/arc-easy.json", - f"{ut_data_dir}/metrics/exactstrmatch.py", - f"{ut_data_dir}/prompt-templates/mcq-template.json", - f"{ut_data_dir}/datasets/arc-challenge.json", - f"{ut_data_dir}/runners/my-new-recipe-runner.json", - f"{ut_data_dir}/databases/my-new-recipe-runner.db", - f"{ut_data_dir}/runners/my-runner.json", - f"{ut_data_dir}/databases/my-runner.db", - f"{ut_data_dir}/results/my-new-recipe-runner-result.json", - f"{ut_data_dir}/results/sample-result.json", - f"{ut_data_dir}/cookbooks/tamil-language-cookbook.json", - ] - - #files generated from unit tests - benchmarking_files.extend([ - f"{ut_data_dir}/cookbooks/my-unit-test-cookbook.json", - f"{ut_data_dir}/databases/my-new-cookbook.db", - f"{ut_data_dir}/databases/my-new-recipe.db", - f"{ut_data_dir}/databases/my-unit-test-cookbook.db", - f"{ut_data_dir}/results/my-new-cookbook.json", - f"{ut_data_dir}/results/my-unit-test-cookbook.json", - f"{ut_data_dir}/results/my-unit-test-recipe.json", - f"{ut_data_dir}/runners/my-new-cookbook.json", - f"{ut_data_dir}/runners/my-unit-test-cookbook.json", - f"{ut_data_dir}/runners/my-unit-test-recipe.json", - f"{ut_data_dir}/recipes/my-unit-test-recipe.json", - ]) - for benchmarking_file in benchmarking_files: - if os.path.exists(benchmarking_file): - os.remove(benchmarking_file) - - - test_recipe_id = "my-unit-test-recipe" - test_cookbook_id = "my-unit-test-cookbook" - err_unrecognised_arg = "Error: unrecognized arguments" - err_missing_required_arg = "Error: the following arguments are required" - - # ------------------------------------------------------------------------------ - # Creation of files - # ------------------------------------------------------------------------------ - @pytest.mark.parametrize( - "command_list, expected_output", - [ - # Success: Add with missing optional args - ( - ["add_recipe 'My unit test recipe' " - "'hello world description?!' " - "\"['category1','category2']\" " - "\"['bbq-lite-age-ambiguous']\" " - "\"['bertscore','bleuscore']\" " - "-p \"['analogical-similarity','mmlu']\" " - "-t \"['tag1','tag2']\" "], - f"[add_recipe]: Recipe ({test_recipe_id}) created." - ), - - # Failure: Add with 1 missing required argument - ( - ["add_recipe 'My unit test recipe' " - "'hello world description?!' 
" - "\"['category1','category2']\" " - "\"['bbq-lite-age-ambiguous']\" " - "-p \"['analogical-similarity','mmlu']\" " - "-t \"['tag1','tag2']\" " - "-g \"{'A':[80,100],'B':[60,79],'C':[40,59],'D':[20,39],'E':[0,19]}\" "], - err_missing_required_arg - ), - - # Failure: Add with missing required arguments - ( - ["add_recipe 'My unit test recipe' " - "'hello world description?!' " - "\"['category1','category2']\" " - "-p \"['analogical-similarity','mmlu']\" " - "-t \"['tag1','tag2']\" " - "-g \"{'A':[80,100],'B':[60,79],'C':[40,59],'D':[20,39],'E':[0,19]}\" "], - err_missing_required_arg - ), - - # Success: Help example - ( - ["add_recipe 'My unit test recipe' " - "'hello world description?!' " - "\"['category1','category2']\" " - "\"['bbq-lite-age-ambiguous']\" " - "\"['bertscore','bleuscore']\" " - "-p \"['analogical-similarity','mmlu']\" " - "-t \"['tag1','tag2']\" " - "-g \"{'A':[80,100],'B':[60,79],'C':[40,59],'D':[20,39],'E':[0,19]}\" "], - f"[add_recipe]: Recipe ({test_recipe_id}) created." - ), - - # Failure: Add with non-existent dataset - ( - ["add_recipe 'My unit test recipe' " - "'hello world description?!' " - "\"['category1','category2']\" " - "\"['bbq-lite-age-ambiguous', 'bbq-lite-age-ambiguousx']\" " - "\"['bertscore','bleuscore']\" " - "-p \"['analogical-similarity','mmlu']\" " - "-t \"['tag1','tag2']\" " - "-g \"{'A':[80,100],'B':[60,79],'C':[40,59],'D':[20,39],'E':[0,19]}\" "], - "Dataset bbq-lite-age-ambiguousx does not exist." - ), - - - # Failure: Add with non-existent metric - ( - ["add_recipe 'My unit test recipe' " - "'hello world description?!' " - "\"['category1','category2']\" " - "\"['bbq-lite-age-ambiguous', 'bbq-lite-age-ambiguous']\" " - "\"['bertscore','bleuscorex']\" " - "-p \"['analogical-similarity','mmlu']\" " - "-t \"['tag1','tag2']\" " - "-g \"{'A':[80,100],'B':[60,79],'C':[40,59],'D':[20,39],'E':[0,19]}\" "], - "Metric bleuscorex does not exist." - ), - - - # Failure: Add with non-existent prompt template - ( - ["add_recipe 'My unit test recipe' " - "'hello world description?!' " - "\"['category1','category2']\" " - "\"['bbq-lite-age-ambiguous']\" " - "\"['bertscore','bleuscore']\" " - "-p \"['analogical-similarity','mmlux']\" " - "-t \"['tag1','tag2']\" " - "-g \"{'A':[80,100],'B':[60,79],'C':[40,59],'D':[20,39],'E':[0,19]}\" "], - "Prompt Template mmlux does not exist." - ), - - # Failure: Add with incorrect parameter type for lists - ( - ["add_recipe 'My unit test recipe' " - "'hello world description?!' " - "\"['category1','category2']\" " - "\"['bbq-lite-age-ambiguous']\" " - "\"['bertscore','bleuscore']\" " - "-p \"['analogical-similarity','mmlu']\" " - "-t \"'tag1'\" " - "-g \"{'A':[80,100],'B':[60,79],'C':[40,59],'D':[20,39],'E':[0,19]}\" "], - "[add_recipe]: 1 validation error for api_create_recipe" - ), - - # Failure: Add with unknown flag - ( - ["add_recipe 'My unit test recipe' " - "'hello world description?!' " - "\"['category1','category2']\" " - "\"['bbq-lite-age-ambiguous']\" " - "\"['bertscore','bleuscore']\" " - "-p \"['analogical-similarity','mmlu']\" " - "-t \"['tag1','tag2']\" " - "-g \"{'A':[80,100],'B':[60,79],'C':[40,59],'D':[20,39],'E':[0,19]}\" " - "-x o"], - err_unrecognised_arg - ), - ] - ) - def test_add_recipe(self, cli, command_list, expected_output, capsys): - perform_assertion(cli, command_list, expected_output, capsys) - - @pytest.mark.parametrize( - "command_list, expected_output", - [ - # Success: Help example - ( - ["add_cookbook 'My unit test cookbook' 'hello world description?!' 
" - "\"['arc']\""], - "[add_cookbook]: Cookbook (my-unit-test-cookbook) created." - ), - - # Failure: Add with 1 missing required argument - ( - ["add_cookbook 'hello world description?!' \"['arc']\""], - err_missing_required_arg - ), - - # Failure: Add with missing required arguments - ( - ["add_cookbook \"['arc']\""], - err_missing_required_arg - ), - - # Failure: Add with incorrect parameter type for description - ( - ["add_cookbook 'My unit test cookbook' 'hello world description?!' " - "\"'this is not a list!!'\""], - "[add_cookbook]: 1 validation error for api_create_cookbook" - ), - - # Failure: Add with incorrect parameter type for recipe list - ( - ["add_cookbook 'My unit test cookbook' 'hello world description?!' " - "\"'this is not a list!!'\""], - "[add_cookbook]: 1 validation error for api_create_cookbook" - ), - # Failure: Add with non-existent recipe - ( - ["add_cookbook 'My unit test cookbook' 'hello world description?!' " - "\"['auto-categorisatison']\""], - "recipe does not exist." - ), - - # Failure: Add with unknown flag - ( - ["add_cookbook 'My unit test cookbook' 'hello world description?!' " - "\"['arc']\" -n 1"], - err_unrecognised_arg - ), - ], - ) - def test_add_cookbook(self, cli, command_list, expected_output, capsys): - perform_assertion(cli, command_list, expected_output, capsys) - - # ------------------------------------------------------------------------------ - # Listing and viewing data - # ------------------------------------------------------------------------------ - # @pytest.mark.parametrize( - # "command_list, expected_output", - # [ - # # Success: ID - # ([f"view_recipe {test_recipe_id}"], "id: my-unit-test-recipe"), - - # # Success: description - # ([f"view_recipe {test_recipe_id}"], "hello world description?!"), - - # # Success: tags - # ([f"view_recipe {test_recipe_id}"], "1. tag1"), - # ([f"view_recipe {test_recipe_id}"], "2. tag2"), - - # # Success: categories - # ([f"view_recipe {test_recipe_id}"], "1. category1"), - # ([f"view_recipe {test_recipe_id}"], "2. category2"), - - # # Success: grading scale - # ([f"view_recipe {test_recipe_id}"], "A [80 - 100]"), - # ([f"view_recipe {test_recipe_id}"], "B [60 - 79]"), - # ([f"view_recipe {test_recipe_id}"], "C [40 - 59]"), - # ([f"view_recipe {test_recipe_id}"], "D [20 - 39]"), - # ([f"view_recipe {test_recipe_id}"], "E [0 - 19]"), - - # # Success: dataset - # ([f"view_recipe {test_recipe_id}"], "bbq-lite-age-ambiguous"), - - # # # Success: prompt template - # # ("analogical-similarity"), - # # ("mmlu"), - - # # # Success: metric - # # ("bertscore"), - # # ("bleuscore"), - - # # # Success: attack strategies - # # ("charswap_attack") - - # # Failure: Test with unrecognised flag - # ([f"view_recipe {test_recipe_id} -x o"], err_unrecognised_arg), - - # # Failure: Test with non-existment recipe - # ([f"view_recipe nope"], "[view_recipe]: No recipes found with ID") - # ] - # ) - # def test_view_recipe(self, cli, command_list, expected_output, capsys): - # perform_assertion(cli, command_list, expected_output, capsys) - - - @pytest.mark.parametrize( - "command_list, expected_output", - [ - # Success: No optional args - ( - ["list_recipes"], - "bbq" - ), - - # Success: Find with results - ( - ["list_recipes -f bbq"], - "bbq" - ), - # Success: Optional args with no results found - ( - ["list_recipes -f \"RandomArg\""], - "There are no recipes found." 
- ), - - # Failure: List with unknown flag - ( - ["list_recipes -x test"], - err_unrecognised_arg - ), - ] - ) - def test_list_recipes(self, cli, command_list, expected_output, capsys): - perform_assertion(cli, command_list, expected_output, capsys) - - @pytest.mark.parametrize( - "function_args, expected_output", - [ - # Success: no results - ("wrong_recipes", "There are no recipes found."), - - # Success: results returned - ("bbq", "bbq"), - ] - ) - def test_list_recipes_output(self, function_args, expected_output, capsys): - # additional function to test listing as the list command is hard to assert in CLI - parser = argparse.ArgumentParser() - parser.add_argument("-f", "--find", type=str, nargs="?") - parser.add_argument("-p", "--pagination", type=str, nargs="?") - args = parser.parse_args(['--find', function_args]) - - returned_results = list_recipes(args) - perform_assertion_function_output(expected_output, returned_results, capsys) - - # def test_view_cookbook(self, cli): - # pass - - @pytest.mark.parametrize( - "command_list, expected_output", - [ - # Success: No optional args - ( - ["list_cookbooks"], - "chinese-safety-cookbook" - ), - - # Success: Find with results - ( - ["list_cookbooks -f tamil"], - "tamil-language-cookbook" - ), - # Success: Optional args with no results found - ( - ["list_cookbooks -f \"RandomArg\""], - "There are no cookbooks found." - ), - - # Failure: List with unknown flag - ( - ["list_cookbooks -x test"], - err_unrecognised_arg - ), - ] - ) - def test_list_cookbooks(self, cli, command_list, expected_output, capsys): - perform_assertion(cli, command_list, expected_output, capsys) - - @pytest.mark.parametrize( - "function_args, expected_output", - [ - # Success: no results - ("no-such-cookbook", "There are no cookbooks found."), - - # Success: results returned - ("chinese", "chinese-safety-cookbook"), - ] - ) - def test_list_cookbooks_output(self, function_args, expected_output, capsys): - # additional function to test listing as the list command is hard to assert in CLI - parser = argparse.ArgumentParser() - parser.add_argument("-f", "--find", type=str, nargs="?") - parser.add_argument("-p", "--pagination", type=str, nargs="?") - args = parser.parse_args(['--find', function_args]) - - returned_results = list_cookbooks(args) - perform_assertion_function_output(expected_output, returned_results, capsys) - - # def test_view_dataset(self, cli): - # pass - - @pytest.mark.parametrize( - "command_list, expected_output", - [ - # Success: No optional args - ( - ["list_datasets"], - "arc-easy" - ), - - # Success: Find with results - ( - ["list_datasets -f bbq"], - "bbq-lite-age-disamb" - ), - # Success: Optional args with no results found - ( - ["list_datasets -f \"RandomArg\""], - "There are no datasets found." 
- ), - - # Failure: List with unknown flag - ( - ["list_datasets -x test"], - err_unrecognised_arg - ), - ] - ) - def test_list_datasets(self, cli, command_list, expected_output, capsys): - perform_assertion(cli, command_list, expected_output, capsys) - - @pytest.mark.parametrize( - "function_args, expected_output", - [ - # Success: no results - ("no-such-dataset", "There are no datasets found."), - - # Success: results returned - ("arc", "arc-easy"), - ] - ) - def test_list_datasets_output(self, function_args, expected_output, capsys): - # additional function to test listing as the list command is hard to assert in CLI - parser = argparse.ArgumentParser() - parser.add_argument("-f", "--find", type=str, nargs="?") - parser.add_argument("-p", "--pagination", type=str, nargs="?") - args = parser.parse_args(['--find', function_args]) - - returned_results = list_datasets(args) - perform_assertion_function_output(expected_output, returned_results, capsys) - - # def test_view_metric(self, cli): - # pass - - - @pytest.mark.parametrize( - "command_list, expected_output", - [ - # Success: No optional args - ( - ["list_metrics"], - "bleuscore" - ), - - # Success: Find with results - ( - ["list_metrics -f bertscore"], - "bertscore" - ), - # Success: Optional args with no results found - ( - ["list_metrics -f \"RandomArg\""], - "There are no metrics found." - ), - - # Failure: List with unknown flag - ( - ["list_metrics -x test"], - err_unrecognised_arg - ), - ] - ) - def test_list_metrics(self, cli, command_list, expected_output, capsys): - perform_assertion(cli, command_list, expected_output, capsys) - - @pytest.mark.parametrize( - "function_args, expected_output", - [ - # Success: no results - ("no-such-metrics", "There are no metrics found."), - - # Success: results returned - ("bert", "bertscore"), - ] - ) - def test_list_metrics_output(self, function_args, expected_output, capsys): - # additional function to test listing as the list command is hard to assert in CLI - parser = argparse.ArgumentParser() - parser.add_argument("-f", "--find", type=str, nargs="?") - parser.add_argument("-p", "--pagination", type=str, nargs="?") - args = parser.parse_args(['--find', function_args]) - - returned_results = list_metrics(args) - perform_assertion_function_output(expected_output, returned_results, capsys) - - # ------------------------------------------------------------------------------ - # Updating of files - # ------------------------------------------------------------------------------ - @pytest.mark.parametrize( - "command_list, expected_output", - [ - # Success: Help example update with missing optional arguments - (["add_recipe 'My unit test recipe' " - "'hello world description?!' " - "\"['category1','category2']\" " - "\"['bbq-lite-age-ambiguous']\" " - "\"['bertscore','bleuscore']\" " - "-p \"['analogical-similarity','mmlu']\" " - "-t \"['tag1','tag2']\" ", - f"update_recipe {test_recipe_id} \"[('name', 'My Updated Recipe'), ('tags', ['fairness', 'bbq'])]\""], - "[update_recipe]: Recipe updated." - ), - - # Success: Update every available key - (["add_recipe 'My unit test recipe' " - "'hello world description?!' 
" - "\"['category1','category2']\" " - "\"['bbq-lite-age-ambiguous']\" " - "\"['bertscore','bleuscore']\" " - "-p \"['analogical-similarity','mmlu']\" " - "-t \"['tag1','tag2']\" ", - f"update_recipe {test_recipe_id} \"[('name', 'My Updated Recipe2'), ('tags', ['updated tag']), " - "('description', 'updated description'), ('categories', ['updated cat 1', 'updated cat 2']), " - " ('datasets', ['bbq-lite-age-ambiguous']), ('prompt_templates', ['analogical-similarity', 'mmlu']), " - " ('metrics', ['bleuscore']), ('attack_modules', ['charswap_attack']), " - " ('grading_scale', {'New A':[75,100],'New B':[50,74],'New C':[25,49],'New D':[0,24]}) " - "]\""], - "[update_recipe]: Recipe updated." - ), - - # Failure: Update with some wrong parameter types - (["add_recipe 'My unit test recipe' " - "'hello world description?!' " - "\"['category1','category2']\" " - "\"['bbq-lite-age-ambiguous']\" " - "\"['bertscore','bleuscore']\" " - "-p \"['analogical-similarity','mmlu']\" " - "-t \"['tag1','tag2']\" ", - f"update_recipe {test_recipe_id} \"[('name', ['Name should not be a list']), ('tags', ['updated tag']), " - " ('description', 'updated description'), ('categories', ['updated cat 1', 'updated cat 2']), " - " ('datasets', ['bbq-lite-age-ambiguous']), ('prompt_templates', ['analogical-similarity', 'mmlu']), " - " ('metrics', ['bleuscore']), ('attack_modules', ['charswap_attack']), " - " ('grading_scale', [{'New A':[75,100],'New B':[50,74],'New C':[25,49],'New D':[0,24]}]) " - "]\""], - "[update_recipe]: 2 validation errors for RecipeArguments" - ), - - # Failure: Update with missing required argument - ([f"update_recipe \"[('name', 'My Updated Recipe2'), ('tags', ['updated tag']), " - "('description', 'updated description'), ('categories', ['updated cat 1', 'updated cat 2']), " - " ('datasets', ['bbq-lite-age-ambiguous']), ('prompt_templates', ['analogical-similarity', 'mmlu']), " - " ('metrics', ['bleuscore']), ('attack_modules', ['charswap_attack']), " - " ('grading_scale', {'New A':[75,100],'New B':[50,74],'New C':[25,49],'New D':[0,24]}) " - "]\""], - err_missing_required_arg - ), - - # Failure: Update with non-existent dataset - (["add_recipe 'My unit test recipe' " - "'hello world description?!' " - "\"['category1','category2']\" " - "\"['bbq-lite-age-ambiguous']\" " - "\"['bertscore','bleuscore']\" " - "-p \"['analogical-similarity','mmlu']\" " - "-t \"['tag1','tag2']\" ", - f"update_recipe {test_recipe_id} \"[('name', 'My Updated Recipe2'), ('tags', ['updated tag']), " - "('description', 'updated description'), ('categories', ['updated cat 1', 'updated cat 2']), " - " ('datasets', ['nope']), ('prompt_templates', ['analogical-similarity', 'mmlu']), " - " ('metrics', ['bleuscore']), ('attack_modules', ['charswap_attack']), " - " ('grading_scale', {'New A':[75,100],'New B':[50,74],'New C':[25,49],'New D':[0,24]}) " - "]\""], - "Dataset nope does not exist." - ), - - # Failure: Update with non-existent metric - (["add_recipe 'My unit test recipe' " - "'hello world description?!' 
" - "\"['category1','category2']\" " - "\"['bbq-lite-age-ambiguous']\" " - "\"['bertscore','bleuscore']\" " - "-p \"['analogical-similarity','mmlu']\" " - "-t \"['tag1','tag2']\" ", - f"update_recipe {test_recipe_id} \"[('name', 'My Updated Recipe2'), ('tags', ['updated tag']), " - "('description', 'updated description'), ('categories', ['updated cat 1', 'updated cat 2']), " - " ('datasets', ['bbq-lite-age-ambiguous']), ('prompt_templates', ['analogical-similarity', 'mmlu']), " - " ('metrics', ['nope']), ('attack_modules', ['charswap_attack']), " - " ('grading_scale', {'New A':[75,100],'New B':[50,74],'New C':[25,49],'New D':[0,24]}) " - "]\""], - "Metric nope does not exist." - ), - - # Failure: Update with non-existent prompt template - (["add_recipe 'My unit test recipe' " - "'hello world description?!' " - "\"['category1','category2']\" " - "\"['bbq-lite-age-ambiguous']\" " - "\"['bertscore','bleuscore']\" " - "-p \"['analogical-similarity','mmlu']\" " - "-t \"['tag1','tag2']\" ", - f"update_recipe {test_recipe_id} \"[('name', 'My Updated Recipe2'), ('tags', ['updated tag']), " - "('description', 'updated description'), ('categories', ['updated cat 1', 'updated cat 2']), " - " ('datasets', ['bbq-lite-age-ambiguous']), ('prompt_templates', ['analogical-similarity', 'nope']), " - " ('metrics', ['bleuscore']), ('attack_modules', ['charswap_attack']), " - " ('grading_scale', {'New A':[75,100],'New B':[50,74],'New C':[25,49],'New D':[0,24]}) " - "]\""], - "Prompt Template nope does not exist." - ), - - # Failure: Update with unknown flag - ([f"update_recipe {test_recipe_id} \"[('name', 'My Updated Recipe2'), ('tags', ['updated tag']), " - "('description', 'updated description'), ('categories', ['updated cat 1', 'updated cat 2']), " - " ('datasets', ['nope']), ('prompt_templates', ['analogical-similarity', 'mmlu']), " - " ('metrics', ['bleuscore']), ('attack_modules', ['charswap_attack']), " - " ('grading_scale', {'New A':[75,100],'New B':[50,74],'New C':[25,49],'New D':[0,24]}) " - "]\" -x o"], - err_unrecognised_arg - ), - - ] - ) - def test_update_recipe(self, cli, command_list, expected_output, capsys): - perform_assertion(cli, command_list, expected_output, capsys) - - @pytest.mark.parametrize( - "command_list, expected_output", - [ - # Success: Help example - (["add_cookbook 'My unit test cookbook' 'hello world description?!' " - "\"['arc']\"", - f"update_cookbook {test_cookbook_id} \"[('name', 'Updated cookbook name'), " - "('description', 'Updated description'), ('recipes', ['arc'])]\""], - "[update_cookbook]: Cookbook updated." - ), - - # Success: Update some keys - (["add_cookbook 'My unit test cookbook' 'hello world description?!' " - "\"['arc']\"", - f"update_cookbook {test_cookbook_id} \"[('description', 'Updated cookbook description. again.')]\""], - "[update_cookbook]: Cookbook updated." - ), - - # Failure: Update with some wrong parameter types - (["add_cookbook 'My unit test cookbook' 'hello world description?!' 
" - "\"['arc']\"", - f"update_cookbook {test_cookbook_id} \"[('name', ['Updated cookbook name']), " - "('description', 'Updated description'), ('recipes', ['arc'])]\""], - "[update_cookbook]: 1 validation error for CookbookArguments" - ), - - # Failure: Update with missing required argument - ([ - f"update_cookbook \"\""], - err_missing_required_arg - ), - - # # Failure: Update with unknown flag - ([ - f"update_cookbook {test_cookbook_id} \"[('name', 'Updated cookbook name'), " - "('description', 'Updated description'), ('recipes', ['arc'])]\" -x o"], - err_unrecognised_arg - ), - ] - ) - def test_update_cookbook(self, cli, command_list, expected_output, capsys): - perform_assertion(cli, command_list, expected_output, capsys) - - - # ------------------------------------------------------------------------------ - # Running of recipes and cookbooks, and viewing the files generated (Commented out to not run the benchmarks. - # Uncomment to run tests. Add in your token in the connector endpoints to run the tests) - # ------------------------------------------------------------------------------ - - @pytest.mark.parametrize( - "command_list, expected_output", - [ - # # Success: Help example. - # Uncomment this to run the actual benchmarking test with your own token - # # Add in your own token also - # ([f"run_recipe {test_recipe_id} \"['arc']\" \"['openai-gpt35-turbo']\" -n 1 -r 1 " - # "-s \"You are an intelligent AI\""], - # "Time taken to run" - # ), - - # # Failure: Run with non-existent recipes with new runner - # ([f"run_recipe my_new_recipex \"['arc']\" \"['openai-gpt35-turbo']\" -n 1 -r 1 " - # "-s \"You are an intelligent AI\""], - # "No recipes found with ID" - # ), - - # Failure: Run with non-existent connector endpoint with new runner - ([f"run_recipe my_new_recipe_two \"['arc']\" \"['openai-gpt35-turbox']\" -n 1 -r 1 " - "-s \"You are an intelligent AI\""], - "Connector endpoint openai-gpt35-turbox does not exist." - ), - - # Failure: Run with wrong type for optional arguments (input string instead of int) - ([f"run_recipe my_new_recipe \"['arc']\" \"['openai-gpt35-turbox']\" -n x -r s " - "-s \"You are an intelligent AI\""], - "invalid int value" - ), - - # Failure: Run with unknown flag - ([f"run_recipe my_new_recipe \"['arc']\" \"['openai-gpt35-turbox']\" -n 1 -r 1 " - "-s \"You are an intelligent AI\" -x o"], - err_unrecognised_arg - ), - ] - ) - def test_run_recipe(self, cli, command_list, expected_output, capsys): - perform_assertion(cli, command_list, expected_output, capsys) - - - @pytest.mark.parametrize( - "command_list, expected_output", - [ - # # Success: Help example - # ([f"run_cookbook {test_cookbook_id} \"['chinese-safety-cookbook']\" \"['openai-gpt35-turbo']\" -n 1 -r 1 " - # "-s \"You are an intelligent AI\""], - # "Time taken to run" - # ), - - # Failure: Run with non-existent cookbook - # ([f"run_cookbook my_new_cookbook \"['chinese-safety-cookbookx']\" \"['openai-gpt35-turbo']\" -n 1 -r 1 " - # "-s \"You are an intelligent AI\""], - # "No cookbooks found with ID" - # ), - - # Failure: Run with non-existent connector endpoint with new runner - ([f"run_cookbook my_new_cookbook_two \"['chinese-safety-cookbook']\" \"['openai-gpt35-turbox']\" -n 1 -r 1 " - "-s \"You are an intelligent AI\""], - "Connector endpoint openai-gpt35-turbox does not exist." 
- ), - - - # Failure: Run with wrong type for optional arguments (input string instead of int) - ([f"run_cookbook my_new_cookbook \"['chinese-safety-cookbook']\" \"['openai-gpt35-turbo']\" -n x -r s " - "-s \"You are an intelligent AI\""], - "invalid int value" - ), - - # Failure: Run with unknown flag - ([f"run_cookbook my_new_cookbook\"['chinese-safety-cookbook']\" \"['openai-gpt35-turbo']\" -n 1 -r 1 " - "-s \"You are an intelligent AI\" -x o"], - err_unrecognised_arg - ), - ] - ) - def test_run_cookbook(self, cli, command_list, expected_output, capsys): - perform_assertion(cli, command_list, expected_output, capsys) - - # def test_view_result(self, cli): - # pass - - @pytest.mark.parametrize( - "command_list, expected_output", - [ - # Success: No optional args - ( - ["list_results"], - "my-new-recipe-runner-result" - ), - - # Success: Find with results - ( - ["list_results -f sample-result"], - "sample-result" - ), - # Success: Optional args with no results found - ( - ["list_results -f \"RandomArg\""], - "There are no results found." - ), - - # Failure: List with unknown flag - ( - ["list_results -x test"], - err_unrecognised_arg - ), - ] - ) - def test_list_results(self, cli, command_list, expected_output, capsys): - perform_assertion(cli, command_list, expected_output, capsys) - - @pytest.mark.parametrize( - "function_args, expected_output", - [ - # Success: no results - ("no-such-result", "There are no results found."), - - # # Success: results returned - # ("my-new-recipe-runner", "my-new-recipe-runner-result"), - ] - ) - def test_list_results_output(self, function_args, expected_output, capsys): - # additional function to test listing as the list command is hard to assert in CLI - parser = argparse.ArgumentParser() - parser.add_argument("-f", "--find", type=str, nargs="?") - parser.add_argument("-p", "--pagination", type=str, nargs="?") - args = parser.parse_args(['--find', function_args]) - - returned_results = list_results(args) - perform_assertion_function_output(expected_output, returned_results, capsys) - - # def test_view_run(self, cli): - # pass - - @pytest.mark.parametrize( - "command_list, expected_output", - [ - # Success: No optional args - ( - ["list_runs"], - "my-new-recipe-runner" - ), - - # Success: Find with results - ( - ["list_runs -f my-new-recipe-runner"], - "my-new-recipe-runner" - ), - # Success: Optional args with no results found - ( - ["list_runs -f \"RandomArg\""], - "There are no runs found." 
- ), - - # Failure: List with unknown flag - ( - ["list_runs -x test"], - err_unrecognised_arg - ), - ] - ) - def test_list_runs(self, cli, command_list, expected_output, capsys): - perform_assertion(cli, command_list, expected_output, capsys) - - @pytest.mark.parametrize( - "function_args, expected_output", - [ - # Success: no results - ("no-such-run", "There are no runs found."), - - # # Success: results returned - # ("my-new-recipe-runner", "my-new-recipe-runner"), - ] - ) - def test_list_runs_output(self, function_args, expected_output, capsys): - # additional function to test listing as the list command is hard to assert in CLI - parser = argparse.ArgumentParser() - parser.add_argument("-f", "--find", type=str, nargs="?") - parser.add_argument("-p", "--pagination", type=str, nargs="?") - args = parser.parse_args(['--find', function_args]) - - returned_results = list_runs(args) - perform_assertion_function_output(expected_output, returned_results, capsys) - - - # def test_view_runner(self, cli): - # pass - - - @pytest.mark.parametrize( - "command_list, expected_output", - [ - # Success: No optional args - ( - ["list_runners"], - "my-new-recipe-runner" - ), - - # # Success: List runs with unknown flag will not have an error - # because list_runners does not take in an arg (find will be implemented soon) - ( - ["list_runners -x test"], - "my-new-recipe-runner" - ), - ] - ) - def test_list_runners(self, cli, command_list, expected_output, capsys): - perform_assertion(cli, command_list, expected_output, capsys) - - - # # ------------------------------------------------------------------------------ - # # Deletion of files - # # ------------------------------------------------------------------------------ - # @pytest.mark.parametrize( - # "command_list, expected_output", - # [ - # # # Success: Delete existing recipe TOFIX - # # (f"delete_recipe {test_recipe_id}", "y", "[delete_recipe]: Recipe deleted."), - - # # Failure: Delete with missing argument - # ([f"delete_recipe"], err_missing_required_arg), - - # # Failure: Delete with unknown flag - # ([f"delete_recipe {test_recipe_id} -x o"], err_unrecognised_arg), - # ] - # ) - # def test_delete_session(self, cli, command_list, expected_output, capsys): - # perform_assertion(cli, command_list, expected_output, capsys) - - # @pytest.mark.parametrize( - # "command_list, expected_output", - # [ - # # # Success: Delete existing cookbook - # # (f"delete_cookbook {test_cookbook_id}", "y", "[delete_cookbook]: Cookbook deleted."), - - # # Failure: Delete with missing argument - # ([f"delete_cookbook"], err_missing_required_arg), - - # # Failure: Delete with unknown flag - # ([f"delete_cookbook {test_cookbook_id} -x o"], err_unrecognised_arg), - # ] - # ) - # def test_delete_cookbook(self, cli, command_list, expected_output, capsys): - # perform_assertion(cli, command_list, expected_output, capsys) - - # def test_delete_dataset(self, cli): - # pass - - # def test_delete_metrics(self, cli): - # pass - - # def test_delete_result(self, cli): - # pass - - # def test_delete_runner(self, cli): - # pass diff --git a/tests/unit-tests/cli/test_cookbook.py b/tests/unit-tests/cli/test_cookbook.py new file mode 100644 index 00000000..3d9194dc --- /dev/null +++ b/tests/unit-tests/cli/test_cookbook.py @@ -0,0 +1,1829 @@ +from ast import literal_eval +from unittest.mock import AsyncMock, MagicMock, patch + +import pytest + +from moonshot.integrations.cli.benchmark.cookbook import ( + add_cookbook, + delete_cookbook, + list_cookbooks, + run_cookbook, + 
update_cookbook, + view_cookbook, +) + + +class TestCollectionCliCookbook: + api_response = [ + { + "id": 1, + "name": "Cookbook 1", + "description": "Desc 1", + "recipes": ["recipe1"], + } + ] + api_response_pagination = [ + { + "id": 1, + "name": "Cookbook 1", + "description": "Desc 1", + "recipes": ["recipe1"], + "idx": 1, + } + ] + + @pytest.fixture(autouse=True) + def init(self): + # Perform tests + yield + + # ------------------------------------------------------------------------------ + # Test add_cookbook functionality + # ------------------------------------------------------------------------------ + @pytest.mark.parametrize( + "name, description, recipes, expected_output", + [ + # Valid case + ( + "Test Cookbook", + "This is a test cookbook.", + "['recipe1', 'recipe2']", + "[add_cookbook]: Cookbook (new_cookbook_id) created.", + ), + ( + "Another Cookbook", + "Another description.", + "['recipe3']", + "[add_cookbook]: Cookbook (new_cookbook_id) created.", + ), + # Invalid case for name + ( + None, + "This is a test cookbook.", + "['recipe1', 'recipe2']", + "[add_cookbook]: The 'name' argument must be a non-empty string and not None.", + ), + ( + "", + "This is a test cookbook.", + "['recipe1', 'recipe2']", + "[add_cookbook]: The 'name' argument must be a non-empty string and not None.", + ), + ( + 99, + "This is a test cookbook.", + "['recipe1', 'recipe2']", + "[add_cookbook]: The 'name' argument must be a non-empty string and not None.", + ), + ( + {}, + "This is a test cookbook.", + "['recipe1', 'recipe2']", + "[add_cookbook]: The 'name' argument must be a non-empty string and not None.", + ), + ( + [], + "This is a test cookbook.", + "['recipe1', 'recipe2']", + "[add_cookbook]: The 'name' argument must be a non-empty string and not None.", + ), + ( + (), + "This is a test cookbook.", + "['recipe1', 'recipe2']", + "[add_cookbook]: The 'name' argument must be a non-empty string and not None.", + ), + ( + True, + "This is a test cookbook.", + "['recipe1', 'recipe2']", + "[add_cookbook]: The 'name' argument must be a non-empty string and not None.", + ), + # Invalid case for description + ( + "Test Cookbook", + None, + "['recipe1', 'recipe2']", + "[add_cookbook]: The 'description' argument must be a non-empty string and not None.", + ), + ( + "Test Cookbook", + "", + "['recipe1', 'recipe2']", + "[add_cookbook]: The 'description' argument must be a non-empty string and not None.", + ), + ( + "Test Cookbook", + 99, + "['recipe1', 'recipe2']", + "[add_cookbook]: The 'description' argument must be a non-empty string and not None.", + ), + ( + "Test Cookbook", + {}, + "['recipe1', 'recipe2']", + "[add_cookbook]: The 'description' argument must be a non-empty string and not None.", + ), + ( + "Test Cookbook", + [], + "['recipe1', 'recipe2']", + "[add_cookbook]: The 'description' argument must be a non-empty string and not None.", + ), + ( + "Test Cookbook", + (), + "['recipe1', 'recipe2']", + "[add_cookbook]: The 'description' argument must be a non-empty string and not None.", + ), + ( + "Test Cookbook", + True, + "['recipe1', 'recipe2']", + "[add_cookbook]: The 'description' argument must be a non-empty string and not None.", + ), + # Invalid case for recipes - not a list of strings + ( + "Test Cookbook", + "This is a test cookbook.", + "None", + "[add_cookbook]: The 'recipes' argument must be a list of strings after evaluation.", + ), + ( + "Test Cookbook", + "This is a test cookbook.", + "[123, 'recipe2']", + "[add_cookbook]: The 'recipes' argument must be a list of strings after 
evaluation.", + ), + # Invalid case for recipes + ( + "Test Cookbook", + "This is a test cookbook.", + None, + "[add_cookbook]: The 'recipes' argument must be a non-empty string and not None.", + ), + ( + "Test Cookbook", + "This is a test cookbook.", + "", + "[add_cookbook]: The 'recipes' argument must be a non-empty string and not None.", + ), + ( + "Test Cookbook", + "This is a test cookbook.", + 99, + "[add_cookbook]: The 'recipes' argument must be a non-empty string and not None.", + ), + ( + "Test Cookbook", + "This is a test cookbook.", + {}, + "[add_cookbook]: The 'recipes' argument must be a non-empty string and not None.", + ), + ( + "Test Cookbook", + "This is a test cookbook.", + [], + "[add_cookbook]: The 'recipes' argument must be a non-empty string and not None.", + ), + ( + "Test Cookbook", + "This is a test cookbook.", + (), + "[add_cookbook]: The 'recipes' argument must be a non-empty string and not None.", + ), + ( + "Test Cookbook", + "This is a test cookbook.", + True, + "[add_cookbook]: The 'recipes' argument must be a non-empty string and not None.", + ), + # Exception case + ( + "Test Cookbook", + "This is a test cookbook.", + "['recipe1', 'recipe2']", + "[add_cookbook]: An error has occurred while creating cookbook.", + ), + ], + ) + @patch("moonshot.integrations.cli.benchmark.cookbook.api_create_cookbook") + def test_add_cookbook( + self, + mock_api_create_cookbook, + name, + description, + recipes, + expected_output, + capsys, + ): + if "error" in expected_output: + mock_api_create_cookbook.side_effect = Exception( + "An error has occurred while creating cookbook." + ) + else: + mock_api_create_cookbook.return_value = "new_cookbook_id" + + class Args: + pass + + args = Args() + args.name = name + args.description = description + args.recipes = recipes + + add_cookbook(args) + + captured = capsys.readouterr() + assert expected_output == captured.out.strip() + + if ( + isinstance(name, str) + and name + and isinstance(description, str) + and description + and isinstance(recipes, str) + and recipes + ): + try: + recipes_list = literal_eval(recipes) + if not ( + isinstance(recipes_list, list) + and all(isinstance(recipe, str) for recipe in recipes_list) + ): + raise ValueError( + "The 'recipes' argument must be a list of strings after evaluation." 
+ ) + except Exception: + recipes_list = None + if recipes_list is not None: + mock_api_create_cookbook.assert_called_once_with( + name, description, recipes_list + ) + else: + mock_api_create_cookbook.assert_not_called() + else: + mock_api_create_cookbook.assert_not_called() + + # ------------------------------------------------------------------------------ + # Test list_cookbooks functionality with non-mocked filter-data + # ------------------------------------------------------------------------------ + @pytest.mark.parametrize( + "find, pagination, api_response, expected_output, expected_log, to_be_called", + [ + # Valid cases + ( + None, + None, + api_response, + api_response, + "", + True, + ), + # No cookbooks + (None, None, [], None, "There are no cookbooks found.", False), + ( + "Cookbook", + None, + api_response, + api_response, + "", + True, + ), + ( + None, + "(1, 1)", + api_response, + api_response_pagination, + "", + True, + ), + ("Cookbook", "(1, 1)", [], None, "There are no cookbooks found.", False), + # Invalid cases for find + ( + "", + None, + api_response, + None, + "[list_cookbooks]: The 'find' argument must be a non-empty string and not None.", + False, + ), + ( + 99, + None, + api_response, + None, + "[list_cookbooks]: The 'find' argument must be a non-empty string and not None.", + False, + ), + ( + {}, + None, + api_response, + None, + "[list_cookbooks]: The 'find' argument must be a non-empty string and not None.", + False, + ), + ( + [], + None, + api_response, + None, + "[list_cookbooks]: The 'find' argument must be a non-empty string and not None.", + False, + ), + ( + (), + None, + api_response, + None, + "[list_cookbooks]: The 'find' argument must be a non-empty string and not None.", + False, + ), + ( + True, + None, + api_response, + None, + "[list_cookbooks]: The 'find' argument must be a non-empty string and not None.", + False, + ), + # Invalid cases for pagination + ( + None, + "", + api_response, + None, + "[list_cookbooks]: The 'pagination' argument must be a non-empty string and not None.", + False, + ), + ( + None, + 99, + api_response, + None, + "[list_cookbooks]: The 'pagination' argument must be a non-empty string and not None.", + False, + ), + ( + None, + {}, + api_response, + None, + "[list_cookbooks]: The 'pagination' argument must be a non-empty string and not None.", + False, + ), + ( + None, + [], + api_response, + None, + "[list_cookbooks]: The 'pagination' argument must be a non-empty string and not None.", + False, + ), + ( + None, + (), + api_response, + None, + "[list_cookbooks]: The 'pagination' argument must be a non-empty string and not None.", + False, + ), + ( + None, + True, + api_response, + None, + "[list_cookbooks]: The 'pagination' argument must be a non-empty string and not None.", + False, + ), + ( + None, + True, + api_response, + None, + "[list_cookbooks]: The 'pagination' argument must be a non-empty string and not None.", + False, + ), + ( + None, + "(1, 'a')", + api_response, + None, + "[list_cookbooks]: The 'pagination' argument must be a tuple of two integers.", + False, + ), + ( + None, + "(1, 2, 3)", + api_response, + None, + "[list_cookbooks]: The 'pagination' argument must be a tuple of two integers.", + False, + ), + ( + None, + "(1, )", + api_response, + None, + "[list_cookbooks]: The 'pagination' argument must be a tuple of two integers.", + False, + ), + ( + None, + "(0, 1)", + api_response, + None, + "[list_cookbooks]: Invalid page number or page size. 
Page number and page size should start from 1.", + False, + ), + ( + None, + "(1, 0)", + api_response, + None, + "[list_cookbooks]: Invalid page number or page size. Page number and page size should start from 1.", + False, + ), + ( + None, + "(0, 0)", + api_response, + None, + "[list_cookbooks]: Invalid page number or page size. Page number and page size should start from 1.", + False, + ), + ( + None, + "(1, -1)", + api_response, + None, + "[list_cookbooks]: Invalid page number or page size. Page number and page size should start from 1.", + False, + ), + ( + None, + "(-1, 1)", + api_response, + None, + "[list_cookbooks]: Invalid page number or page size. Page number and page size should start from 1.", + False, + ), + ( + None, + "(-1, -1)", + api_response, + None, + "[list_cookbooks]: Invalid page number or page size. Page number and page size should start from 1.", + False, + ), + # Exception case + ( + None, + None, + api_response, + None, + "[list_cookbooks]: An error has occurred while listing cookbooks.", + False, + ), + ], + ) + @patch("moonshot.integrations.cli.benchmark.cookbook.api_get_all_cookbook") + @patch("moonshot.integrations.cli.benchmark.cookbook._display_cookbooks") + def test_list_cookbooks( + self, + mock_display_cookbooks, + mock_api_get_all_cookbook, + find, + pagination, + api_response, + expected_output, + expected_log, + to_be_called, + capsys, + ): + if "error" in expected_log: + mock_api_get_all_cookbook.side_effect = Exception( + "An error has occurred while listing cookbooks." + ) + else: + mock_api_get_all_cookbook.return_value = api_response + + class Args: + pass + + args = Args() + args.find = find + args.pagination = pagination + + result = list_cookbooks(args) + + captured = capsys.readouterr() + assert expected_log == captured.out.strip() + assert result == expected_output + + if to_be_called: + mock_display_cookbooks.assert_called_once_with(api_response) + else: + mock_display_cookbooks.assert_not_called() + + # ------------------------------------------------------------------------------ + # Test list_cookbooks functionality with mocked filter-data + # ------------------------------------------------------------------------------ + @pytest.mark.parametrize( + "find, pagination, api_response, filtered_response, expected_output, expected_log, to_be_called", + [ + ( + None, + None, + api_response, + api_response_pagination, + api_response_pagination, + "", + True, + ), + ( + "Cookbook", + None, + api_response, + api_response_pagination, + api_response_pagination, + "", + True, + ), + ( + None, + "(0, 1)", + api_response, + api_response_pagination, + api_response_pagination, + "", + True, + ), + # Case where filtered_response is None + ( + None, + None, + api_response, + None, + None, + "There are no cookbooks found.", + False, + ), + # Case where filtered_response is an empty list + ( + None, + None, + api_response, + [], + None, + "There are no cookbooks found.", + False, + ), + ], + ) + @patch("moonshot.integrations.cli.benchmark.cookbook.api_get_all_cookbook") + @patch("moonshot.integrations.cli.benchmark.cookbook._display_cookbooks") + @patch("moonshot.integrations.cli.benchmark.cookbook.filter_data") + def test_list_cookbooks_filtered( + self, + mock_filter_data, + mock_display_cookbooks, + mock_api_get_all_cookbook, + find, + pagination, + api_response, + filtered_response, + expected_output, + expected_log, + to_be_called, + capsys, + ): + mock_api_get_all_cookbook.return_value = api_response + mock_filter_data.return_value = filtered_response 
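The mocked tests above hand the CLI functions a bare `class Args: pass` object with attributes assigned one by one, which stands in for the namespace argparse would normally produce. A minimal equivalent sketch, assuming only the `find` and `pagination` attributes that `list_cookbooks` reads and using illustrative values, is:

    from argparse import Namespace
    from moonshot.integrations.cli.benchmark.cookbook import list_cookbooks

    # argparse gives the handler a Namespace, so any object exposing the
    # expected attributes works; the values below are examples only.
    args = Namespace(find="Cookbook", pagination="(1, 1)")
    list_cookbooks(args)

`types.SimpleNamespace` would serve the same purpose; the hand-rolled `Args` class in the tests behaves identically.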
+ + class Args: + pass + + args = Args() + args.find = find + args.pagination = pagination + + result = list_cookbooks(args) + + captured = capsys.readouterr() + assert expected_log == captured.out.strip() + assert result == expected_output + + if to_be_called: + mock_display_cookbooks.assert_called_once_with(filtered_response) + else: + mock_display_cookbooks.assert_not_called() + + # ------------------------------------------------------------------------------ + # Test view_cookbook functionality + # ------------------------------------------------------------------------------ + @pytest.mark.parametrize( + "cookbook_id, api_response, expected_log, to_be_called", + [ + # Valid case + ( + "1", + { + "id": 1, + "name": "Cookbook 1", + "description": "Desc 1", + "recipes": ["recipe1"], + }, + "", + True, + ), + # Invalid case: cookbook_id is None + ( + None, + { + "id": 1, + "name": "Cookbook 1", + "description": "Desc 1", + "recipes": ["recipe1"], + }, + "[view_cookbook]: The 'cookbook' argument must be a non-empty string and not None.", + False, + ), + # Invalid case: cookbook_id is not a string + ( + "", + { + "id": 1, + "name": "Cookbook 1", + "description": "Desc 1", + "recipes": ["recipe1"], + }, + "[view_cookbook]: The 'cookbook' argument must be a non-empty string and not None.", + False, + ), + ( + 123, + { + "id": 1, + "name": "Cookbook 1", + "description": "Desc 1", + "recipes": ["recipe1"], + }, + "[view_cookbook]: The 'cookbook' argument must be a non-empty string and not None.", + False, + ), + ( + {}, + { + "id": 1, + "name": "Cookbook 1", + "description": "Desc 1", + "recipes": ["recipe1"], + }, + "[view_cookbook]: The 'cookbook' argument must be a non-empty string and not None.", + False, + ), + ( + [], + { + "id": 1, + "name": "Cookbook 1", + "description": "Desc 1", + "recipes": ["recipe1"], + }, + "[view_cookbook]: The 'cookbook' argument must be a non-empty string and not None.", + False, + ), + ( + (), + { + "id": 1, + "name": "Cookbook 1", + "description": "Desc 1", + "recipes": ["recipe1"], + }, + "[view_cookbook]: The 'cookbook' argument must be a non-empty string and not None.", + False, + ), + ( + True, + { + "id": 1, + "name": "Cookbook 1", + "description": "Desc 1", + "recipes": ["recipe1"], + }, + "[view_cookbook]: The 'cookbook' argument must be a non-empty string and not None.", + False, + ), + # Exception case: api_read_cookbook raises an exception + ( + "1", + { + "id": 1, + "name": "Cookbook 1", + "description": "Desc 1", + "recipes": ["recipe1"], + }, + "[view_cookbook]: An error has occurred while reading the cookbook.", + False, + ), + ], + ) + @patch("moonshot.integrations.cli.benchmark.cookbook.api_read_cookbook") + @patch("moonshot.integrations.cli.benchmark.cookbook._display_view_cookbook") + def test_view_cookbook( + self, + mock_display_view_cookbook, + mock_api_read_cookbook, + cookbook_id, + api_response, + expected_log, + to_be_called, + capsys, + ): + if "error" in expected_log: + mock_api_read_cookbook.side_effect = Exception( + "An error has occurred while reading the cookbook." 
+ ) + else: + mock_api_read_cookbook.return_value = api_response + + class Args: + pass + + args = Args() + args.cookbook = cookbook_id + + view_cookbook(args) + + captured = capsys.readouterr() + assert expected_log == captured.out.strip() + + if to_be_called: + mock_display_view_cookbook.assert_called_once_with(api_response) + else: + mock_display_view_cookbook.assert_not_called() + + # ------------------------------------------------------------------------------ + # Test run_cookbook functionality + # ------------------------------------------------------------------------------ + @pytest.mark.parametrize( + "name, cookbooks, endpoints, num_of_prompts, random_seed, system_prompt, \ + runner_proc_module, result_proc_module, expected_log", + [ + # Valid case + ( + "Test Runner", + "['cookbook1', 'cookbook2']", + "['endpoint1', 'endpoint2']", + 10, + 42, + "Test system prompt", + "runner_module", + "result_module", + "", + ), + # Invalid case: name + ( + "", + "['cookbook1', 'cookbook2']", + "['endpoint1', 'endpoint2']", + 10, + 42, + "Test system prompt", + "runner_module", + "result_module", + "[run_cookbook]: The 'name' argument must be a non-empty string and not None.", + ), + ( + None, + "['cookbook1', 'cookbook2']", + "['endpoint1', 'endpoint2']", + 10, + 42, + "Test system prompt", + "runner_module", + "result_module", + "[run_cookbook]: The 'name' argument must be a non-empty string and not None.", + ), + ( + 123, + "['cookbook1', 'cookbook2']", + "['endpoint1', 'endpoint2']", + 10, + 42, + "Test system prompt", + "runner_module", + "result_module", + "[run_cookbook]: The 'name' argument must be a non-empty string and not None.", + ), + ( + {}, + "['cookbook1', 'cookbook2']", + "['endpoint1', 'endpoint2']", + 10, + 42, + "Test system prompt", + "runner_module", + "result_module", + "[run_cookbook]: The 'name' argument must be a non-empty string and not None.", + ), + ( + [], + "['cookbook1', 'cookbook2']", + "['endpoint1', 'endpoint2']", + 10, + 42, + "Test system prompt", + "runner_module", + "result_module", + "[run_cookbook]: The 'name' argument must be a non-empty string and not None.", + ), + ( + (), + "['cookbook1', 'cookbook2']", + "['endpoint1', 'endpoint2']", + 10, + 42, + "Test system prompt", + "runner_module", + "result_module", + "[run_cookbook]: The 'name' argument must be a non-empty string and not None.", + ), + ( + True, + "['cookbook1', 'cookbook2']", + "['endpoint1', 'endpoint2']", + 10, + 42, + "Test system prompt", + "runner_module", + "result_module", + "[run_cookbook]: The 'name' argument must be a non-empty string and not None.", + ), + # Invalid case: cookbooks is not a list of string + ( + "Test Runner", + "[123, 123]", + "['endpoint1', 'endpoint2']", + 10, + 42, + "Test system prompt", + "runner_module", + "result_module", + "[run_cookbook]: The 'cookbooks' argument must evaluate to a list of strings.", + ), + # Invalid case: cookbooks is not a string + ( + "Test Runner", + None, + "['endpoint1', 'endpoint2']", + 10, + 42, + "Test system prompt", + "runner_module", + "result_module", + "[run_cookbook]: The 'cookbooks' argument must be a non-empty string and not None.", + ), + ( + "Test Runner", + "", + "['endpoint1', 'endpoint2']", + 10, + 42, + "Test system prompt", + "runner_module", + "result_module", + "[run_cookbook]: The 'cookbooks' argument must be a non-empty string and not None.", + ), + ( + "Test Runner", + 123, + "['endpoint1', 'endpoint2']", + 10, + 42, + "Test system prompt", + "runner_module", + "result_module", + "[run_cookbook]: The 
'cookbooks' argument must be a non-empty string and not None.", + ), + ( + "Test Runner", + {}, + "['endpoint1', 'endpoint2']", + 10, + 42, + "Test system prompt", + "runner_module", + "result_module", + "[run_cookbook]: The 'cookbooks' argument must be a non-empty string and not None.", + ), + ( + "Test Runner", + [], + "['endpoint1', 'endpoint2']", + 10, + 42, + "Test system prompt", + "runner_module", + "result_module", + "[run_cookbook]: The 'cookbooks' argument must be a non-empty string and not None.", + ), + ( + "Test Runner", + (), + "['endpoint1', 'endpoint2']", + 10, + 42, + "Test system prompt", + "runner_module", + "result_module", + "[run_cookbook]: The 'cookbooks' argument must be a non-empty string and not None.", + ), + ( + "Test Runner", + True, + "['endpoint1', 'endpoint2']", + 10, + 42, + "Test system prompt", + "runner_module", + "result_module", + "[run_cookbook]: The 'cookbooks' argument must be a non-empty string and not None.", + ), + # Invalid case: endpoints is not a list of string + ( + "Test Runner", + "['cookbook1', 'cookbook2']", + "[123, 123]", + 10, + 42, + "Test system prompt", + "runner_module", + "result_module", + "[run_cookbook]: The 'endpoints' argument must evaluate to a list of strings.", + ), + # Invalid case: endpoints is not a string + ( + "Test Runner", + "['cookbook1', 'cookbook2']", + None, + 10, + 42, + "Test system prompt", + "runner_module", + "result_module", + "[run_cookbook]: The 'endpoints' argument must be a non-empty string and not None.", + ), + ( + "Test Runner", + "['cookbook1', 'cookbook2']", + "", + 10, + 42, + "Test system prompt", + "runner_module", + "result_module", + "[run_cookbook]: The 'endpoints' argument must be a non-empty string and not None.", + ), + ( + "Test Runner", + "['cookbook1', 'cookbook2']", + 123, + 10, + 42, + "Test system prompt", + "runner_module", + "result_module", + "[run_cookbook]: The 'endpoints' argument must be a non-empty string and not None.", + ), + ( + "Test Runner", + "['cookbook1', 'cookbook2']", + {}, + 10, + 42, + "Test system prompt", + "runner_module", + "result_module", + "[run_cookbook]: The 'endpoints' argument must be a non-empty string and not None.", + ), + ( + "Test Runner", + "['cookbook1', 'cookbook2']", + [], + 10, + 42, + "Test system prompt", + "runner_module", + "result_module", + "[run_cookbook]: The 'endpoints' argument must be a non-empty string and not None.", + ), + ( + "Test Runner", + "['cookbook1', 'cookbook2']", + (), + 10, + 42, + "Test system prompt", + "runner_module", + "result_module", + "[run_cookbook]: The 'endpoints' argument must be a non-empty string and not None.", + ), + ( + "Test Runner", + "['cookbook1', 'cookbook2']", + True, + 10, + 42, + "Test system prompt", + "runner_module", + "result_module", + "[run_cookbook]: The 'endpoints' argument must be a non-empty string and not None.", + ), + # Invalid case: num_of_prompts is not an integer + ( + "Test Runner", + "['cookbook1', 'cookbook2']", + "['endpoint1', 'endpoint2']", + None, + 42, + "Test system prompt", + "runner_module", + "result_module", + "[run_cookbook]: The 'num_of_prompts' argument must be an integer.", + ), + ( + "Test Runner", + "['cookbook1', 'cookbook2']", + "['endpoint1', 'endpoint2']", + "", + 42, + "Test system prompt", + "runner_module", + "result_module", + "[run_cookbook]: The 'num_of_prompts' argument must be an integer.", + ), + ( + "Test Runner", + "['cookbook1', 'cookbook2']", + "['endpoint1', 'endpoint2']", + {}, + 42, + "Test system prompt", + "runner_module", + 
"result_module", + "[run_cookbook]: The 'num_of_prompts' argument must be an integer.", + ), + ( + "Test Runner", + "['cookbook1', 'cookbook2']", + "['endpoint1', 'endpoint2']", + [], + 42, + "Test system prompt", + "runner_module", + "result_module", + "[run_cookbook]: The 'num_of_prompts' argument must be an integer.", + ), + ( + "Test Runner", + "['cookbook1', 'cookbook2']", + "['endpoint1', 'endpoint2']", + (), + 42, + "Test system prompt", + "runner_module", + "result_module", + "[run_cookbook]: The 'num_of_prompts' argument must be an integer.", + ), + ( + "Test Runner", + "['cookbook1', 'cookbook2']", + "['endpoint1', 'endpoint2']", + True, + 42, + "Test system prompt", + "runner_module", + "result_module", + "[run_cookbook]: The 'num_of_prompts' argument must be an integer.", + ), + # Invalid case: random_seed is not an integer + ( + "Test Runner", + "['cookbook1', 'cookbook2']", + "['endpoint1', 'endpoint2']", + 10, + None, + "Test system prompt", + "runner_module", + "result_module", + "[run_cookbook]: The 'random_seed' argument must be an integer.", + ), + ( + "Test Runner", + "['cookbook1', 'cookbook2']", + "['endpoint1', 'endpoint2']", + 10, + "", + "Test system prompt", + "runner_module", + "result_module", + "[run_cookbook]: The 'random_seed' argument must be an integer.", + ), + ( + "Test Runner", + "['cookbook1', 'cookbook2']", + "['endpoint1', 'endpoint2']", + 10, + {}, + "Test system prompt", + "runner_module", + "result_module", + "[run_cookbook]: The 'random_seed' argument must be an integer.", + ), + ( + "Test Runner", + "['cookbook1', 'cookbook2']", + "['endpoint1', 'endpoint2']", + 10, + [], + "Test system prompt", + "runner_module", + "result_module", + "[run_cookbook]: The 'random_seed' argument must be an integer.", + ), + ( + "Test Runner", + "['cookbook1', 'cookbook2']", + "['endpoint1', 'endpoint2']", + 10, + (), + "Test system prompt", + "runner_module", + "result_module", + "[run_cookbook]: The 'random_seed' argument must be an integer.", + ), + ( + "Test Runner", + "['cookbook1', 'cookbook2']", + "['endpoint1', 'endpoint2']", + 10, + True, + "Test system prompt", + "runner_module", + "result_module", + "[run_cookbook]: The 'random_seed' argument must be an integer.", + ), + # Invalid case: system_prompt is None + ( + "Test Runner", + "['cookbook1', 'cookbook2']", + "['endpoint1', 'endpoint2']", + 10, + 42, + None, + "runner_module", + "result_module", + "[run_cookbook]: The 'system_prompt' argument must be a non-empty string and not None.", + ), + ( + "Test Runner", + "['cookbook1', 'cookbook2']", + "['endpoint1', 'endpoint2']", + 10, + 42, + "", + "runner_module", + "result_module", + "[run_cookbook]: The 'system_prompt' argument must be a non-empty string and not None.", + ), + ( + "Test Runner", + "['cookbook1', 'cookbook2']", + "['endpoint1', 'endpoint2']", + 10, + 42, + {}, + "runner_module", + "result_module", + "[run_cookbook]: The 'system_prompt' argument must be a non-empty string and not None.", + ), + ( + "Test Runner", + "['cookbook1', 'cookbook2']", + "['endpoint1', 'endpoint2']", + 10, + 42, + [], + "runner_module", + "result_module", + "[run_cookbook]: The 'system_prompt' argument must be a non-empty string and not None.", + ), + ( + "Test Runner", + "['cookbook1', 'cookbook2']", + "['endpoint1', 'endpoint2']", + 10, + 42, + (), + "runner_module", + "result_module", + "[run_cookbook]: The 'system_prompt' argument must be a non-empty string and not None.", + ), + ( + "Test Runner", + "['cookbook1', 'cookbook2']", + "['endpoint1', 
'endpoint2']", + 10, + 42, + True, + "runner_module", + "result_module", + "[run_cookbook]: The 'system_prompt' argument must be a non-empty string and not None.", + ), + # Invalid case: runner_proc_module is None + ( + "Test Runner", + "['cookbook1', 'cookbook2']", + "['endpoint1', 'endpoint2']", + 10, + 42, + "Test system prompt", + None, + "result_module", + "[run_cookbook]: The 'runner_proc_module' argument must be a non-empty string and not None.", + ), + ( + "Test Runner", + "['cookbook1', 'cookbook2']", + "['endpoint1', 'endpoint2']", + 10, + 42, + "Test system prompt", + "", + "result_module", + "[run_cookbook]: The 'runner_proc_module' argument must be a non-empty string and not None.", + ), + ( + "Test Runner", + "['cookbook1', 'cookbook2']", + "['endpoint1', 'endpoint2']", + 10, + 42, + "Test system prompt", + {}, + "result_module", + "[run_cookbook]: The 'runner_proc_module' argument must be a non-empty string and not None.", + ), + ( + "Test Runner", + "['cookbook1', 'cookbook2']", + "['endpoint1', 'endpoint2']", + 10, + 42, + "Test system prompt", + [], + "result_module", + "[run_cookbook]: The 'runner_proc_module' argument must be a non-empty string and not None.", + ), + ( + "Test Runner", + "['cookbook1', 'cookbook2']", + "['endpoint1', 'endpoint2']", + 10, + 42, + "Test system prompt", + (), + "result_module", + "[run_cookbook]: The 'runner_proc_module' argument must be a non-empty string and not None.", + ), + ( + "Test Runner", + "['cookbook1', 'cookbook2']", + "['endpoint1', 'endpoint2']", + 10, + 42, + "Test system prompt", + True, + "result_module", + "[run_cookbook]: The 'runner_proc_module' argument must be a non-empty string and not None.", + ), + # Invalid case: result_proc_module is None + ( + "Test Runner", + "['cookbook1', 'cookbook2']", + "['endpoint1', 'endpoint2']", + 10, + 42, + "Test system prompt", + "runner_module", + None, + "[run_cookbook]: The 'result_proc_module' argument must be a non-empty string and not None.", + ), + ( + "Test Runner", + "['cookbook1', 'cookbook2']", + "['endpoint1', 'endpoint2']", + 10, + 42, + "Test system prompt", + "runner_module", + "", + "[run_cookbook]: The 'result_proc_module' argument must be a non-empty string and not None.", + ), + ( + "Test Runner", + "['cookbook1', 'cookbook2']", + "['endpoint1', 'endpoint2']", + 10, + 42, + "Test system prompt", + "runner_module", + {}, + "[run_cookbook]: The 'result_proc_module' argument must be a non-empty string and not None.", + ), + ( + "Test Runner", + "['cookbook1', 'cookbook2']", + "['endpoint1', 'endpoint2']", + 10, + 42, + "Test system prompt", + "runner_module", + [], + "[run_cookbook]: The 'result_proc_module' argument must be a non-empty string and not None.", + ), + ( + "Test Runner", + "['cookbook1', 'cookbook2']", + "['endpoint1', 'endpoint2']", + 10, + 42, + "Test system prompt", + "runner_module", + (), + "[run_cookbook]: The 'result_proc_module' argument must be a non-empty string and not None.", + ), + ( + "Test Runner", + "['cookbook1', 'cookbook2']", + "['endpoint1', 'endpoint2']", + 10, + 42, + "Test system prompt", + "runner_module", + True, + "[run_cookbook]: The 'result_proc_module' argument must be a non-empty string and not None.", + ), + # Exception case: api_create_runner raises an exception + ( + "Test Runner", + "['cookbook1', 'cookbook2']", + "['endpoint1', 'endpoint2']", + 10, + 42, + "Test system prompt", + "runner_module", + "result_module", + "[run_cookbook]: An error has occurred while creating the runner.", + ), + # Exception case: 
api_load_runner raises an exception + ( + "Test Runner", + "['cookbook1', 'cookbook2']", + "['endpoint1', 'endpoint2']", + 10, + 42, + "Test system prompt", + "runner_module", + "result_module", + "[run_cookbook]: An error has occurred while loading the runner.", + ), + # Exception case: api_get_all_runner_name raises an exception + ( + "Test Runner", + "['cookbook1', 'cookbook2']", + "['endpoint1', 'endpoint2']", + 10, + 42, + "Test system prompt", + "runner_module", + "result_module", + "[run_cookbook]: An error has occurred while getting all runner names.", + ), + # Exception case: api_get_all_run raises an exception + ( + "Test Runner", + "['cookbook1', 'cookbook2']", + "['endpoint1', 'endpoint2']", + 10, + 42, + "Test system prompt", + "runner_module", + "result_module", + "[run_cookbook]: An error has occurred while getting all runs.", + ), + # Exception case: no results raises an exception + ( + "Test Runner", + "['cookbook1', 'cookbook2']", + "['endpoint1', 'endpoint2']", + 10, + 42, + "Test system prompt", + "runner_module", + "result_module", + "[run_cookbook]: There are no results generated.", + ), + # Exception case: show_cookbook_results raises an exception + ( + "Test Runner", + "['cookbook1', 'cookbook2']", + "['endpoint1', 'endpoint2']", + 10, + 42, + "Test system prompt", + "runner_module", + "result_module", + "[run_cookbook]: An error has occurred while showing cookbook results.", + ), + ], + ) + @patch("moonshot.integrations.cli.benchmark.cookbook.api_get_all_runner_name") + @patch("moonshot.integrations.cli.benchmark.cookbook.api_load_runner") + @patch("moonshot.integrations.cli.benchmark.cookbook.api_create_runner") + @patch("moonshot.integrations.cli.benchmark.cookbook.api_get_all_run") + @patch("moonshot.integrations.cli.benchmark.cookbook._show_cookbook_results") + def test_run_cookbook( + self, + mock_show_cookbook_results, + mock_api_get_all_run, + mock_api_create_runner, + mock_api_load_runner, + mock_api_get_all_runner_name, + name, + cookbooks, + endpoints, + num_of_prompts, + random_seed, + system_prompt, + runner_proc_module, + result_proc_module, + expected_log, + capsys, + ): + to_trigger_called = False + + if "getting all runner names" in expected_log: + mock_api_get_all_runner_name.side_effect = Exception( + "An error has occurred while getting all runner names." + ) + + elif "creating the runner" in expected_log: + mock_api_get_all_runner_name.return_value = [] + mock_api_create_runner.side_effect = Exception( + "An error has occurred while creating the runner." + ) + + elif "loading the runner" in expected_log: + mock_api_get_all_runner_name.return_value = ["test-runner"] + mock_api_load_runner.side_effect = Exception( + "An error has occurred while loading the runner." + ) + + elif "getting all runs" in expected_log: + mock_api_get_all_runner_name.return_value = [] + mock_api_create_runner.return_value = AsyncMock() + mock_api_get_all_run.side_effect = Exception( + "An error has occurred while getting all runs." + ) + + elif "showing cookbook results" in expected_log: + to_trigger_called = True + mock_api_get_all_runner_name.return_value = [] + mock_api_create_runner.return_value = AsyncMock() + mock_api_get_all_run.return_value = [ + {"results": {"metadata": {"duration": 10}}} + ] + mock_show_cookbook_results.side_effect = Exception( + "An error has occurred while showing cookbook results." 
+ ) + + elif "no results" in expected_log: + mock_api_get_all_runner_name.return_value = [] + mock_api_create_runner.return_value = AsyncMock() + mock_api_get_all_run.return_value = [ + {"someresults": {"metadata": {"duration": 10}}} + ] + + else: + mock_api_create_runner.return_value = AsyncMock() + mock_api_load_runner.return_value = AsyncMock() + mock_api_get_all_runner_name.return_value = [] + mock_api_get_all_run.return_value = [ + {"results": {"metadata": {"duration": 10}}} + ] + + class Args: + pass + + args = Args() + args.name = name + args.cookbooks = cookbooks + args.endpoints = endpoints + args.num_of_prompts = num_of_prompts + args.random_seed = random_seed + args.system_prompt = system_prompt + args.runner_proc_module = runner_proc_module + args.result_proc_module = result_proc_module + + run_cookbook(args) + + captured = capsys.readouterr() + assert expected_log == captured.out.strip() + + if not expected_log or to_trigger_called: + mock_show_cookbook_results.assert_called_once() + else: + mock_show_cookbook_results.assert_not_called() + + # ------------------------------------------------------------------------------ + # Test update_cookbook functionality + # ------------------------------------------------------------------------------ + @pytest.mark.parametrize( + "cookbook, update_values, expected_log, to_be_called", + [ + # Valid case + ( + "Cookbook 1", + "[('name', 'Updated Cookbook'), ('description', 'Updated description')]", + "[update_cookbook]: Cookbook updated.", + True, + ), + # Invalid case - cookbook + ( + "", + "[('name', 'Updated Cookbook'), ('description', 'Updated description')]", + "[update_cookbook]: The 'cookbook' argument must be a non-empty string and not None.", + False, + ), + ( + None, + "[('name', 'Updated Cookbook'), ('description', 'Updated description')]", + "[update_cookbook]: The 'cookbook' argument must be a non-empty string and not None.", + False, + ), + ( + 123, + "[('name', 'Updated Cookbook'), ('description', 'Updated description')]", + "[update_cookbook]: The 'cookbook' argument must be a non-empty string and not None.", + False, + ), + ( + {}, + "[('name', 'Updated Cookbook'), ('description', 'Updated description')]", + "[update_cookbook]: The 'cookbook' argument must be a non-empty string and not None.", + False, + ), + ( + [], + "[('name', 'Updated Cookbook'), ('description', 'Updated description')]", + "[update_cookbook]: The 'cookbook' argument must be a non-empty string and not None.", + False, + ), + ( + (), + "[('name', 'Updated Cookbook'), ('description', 'Updated description')]", + "[update_cookbook]: The 'cookbook' argument must be a non-empty string and not None.", + False, + ), + ( + True, + "[('name', 'Updated Cookbook'), ('description', 'Updated description')]", + "[update_cookbook]: The 'cookbook' argument must be a non-empty string and not None.", + False, + ), + # Invalid case - update values + ( + "Cookbook 1", + "", + "[update_cookbook]: The 'update_values' argument must be a non-empty string and not None.", + False, + ), + ( + "Cookbook 1", + "['', '']", + "[update_cookbook]: The 'update_values' argument must evaluate to a list of tuples.", + False, + ), + ( + "Cookbook 1", + "[[], ()]", + "[update_cookbook]: The 'update_values' argument must evaluate to a list of tuples.", + False, + ), + ( + "Cookbook 1", + None, + "[update_cookbook]: The 'update_values' argument must be a non-empty string and not None.", + False, + ), + ( + "Cookbook 1", + 123, + "[update_cookbook]: The 'update_values' argument must be a 
non-empty string and not None.", + False, + ), + ( + "Cookbook 1", + {}, + "[update_cookbook]: The 'update_values' argument must be a non-empty string and not None.", + False, + ), + ( + "Cookbook 1", + [], + "[update_cookbook]: The 'update_values' argument must be a non-empty string and not None.", + False, + ), + ( + "Cookbook 1", + (), + "[update_cookbook]: The 'update_values' argument must be a non-empty string and not None.", + False, + ), + ( + "Cookbook 1", + True, + "[update_cookbook]: The 'update_values' argument must be a non-empty string and not None.", + False, + ), + # Test case: API update raises an exception + ( + "Cookbook 1", + "[('name', 'Updated Cookbook'), ('description', 'Updated description')]", + "[update_cookbook]: An error has occurred while updating the cookbook.", + True, + ), + ], + ) + @patch("moonshot.integrations.cli.benchmark.cookbook.api_update_cookbook") + def test_update_cookbook( + self, + mock_api_update_cookbook, + capsys, + cookbook, + update_values, + expected_log, + to_be_called, + ): + if "error" in expected_log: + mock_api_update_cookbook.side_effect = Exception( + "An error has occurred while updating the cookbook." + ) + else: + mock_api_update_cookbook.return_value = "updated" + + class Args: + pass + + args = Args() + args.cookbook = cookbook + args.update_values = update_values + + update_cookbook(args) + + captured = capsys.readouterr() + assert expected_log == captured.out.strip() + + if to_be_called: + mock_api_update_cookbook.assert_called_once_with( + args.cookbook, **dict(literal_eval(args.update_values)) + ) + else: + mock_api_update_cookbook.assert_not_called() + + # ------------------------------------------------------------------------------ + # Test delete_cookbook functionality + # ------------------------------------------------------------------------------ + @pytest.mark.parametrize( + "cookbook, expected_log, to_be_called", + [ + # Valid case + ("Cookbook 1", "[delete_cookbook]: Cookbook deleted.", True), + # Invalid case - cookbook + ( + "", + "[delete_cookbook]: The 'cookbook' argument must be a non-empty string and not None.", + False, + ), + ( + None, + "[delete_cookbook]: The 'cookbook' argument must be a non-empty string and not None.", + False, + ), + ( + 123, + "[delete_cookbook]: The 'cookbook' argument must be a non-empty string and not None.", + False, + ), + ( + {}, + "[delete_cookbook]: The 'cookbook' argument must be a non-empty string and not None.", + False, + ), + ( + [], + "[delete_cookbook]: The 'cookbook' argument must be a non-empty string and not None.", + False, + ), + ( + (), + "[delete_cookbook]: The 'cookbook' argument must be a non-empty string and not None.", + False, + ), + ( + True, + "[delete_cookbook]: The 'cookbook' argument must be a non-empty string and not None.", + False, + ), + ], + ) + @patch("moonshot.integrations.cli.benchmark.cookbook.api_delete_cookbook") + def test_delete_cookbook( + self, mock_api_delete_cookbook, capsys, cookbook, expected_log, to_be_called + ): + class Args: + pass + + args = Args() + args.cookbook = cookbook + + with patch( + "moonshot.integrations.cli.benchmark.cookbook.console.input", + return_value="y", + ): + with patch("moonshot.integrations.cli.benchmark.cookbook.console.print"): + delete_cookbook(args) + + captured = capsys.readouterr() + assert expected_log == captured.out.strip() + + if to_be_called: + mock_api_delete_cookbook.assert_called_once_with(args.cookbook) + else: + mock_api_delete_cookbook.assert_not_called() + + @patch( + 
"moonshot.integrations.cli.benchmark.cookbook.console.input", return_value="y" + ) + @patch("moonshot.integrations.cli.benchmark.cookbook.api_delete_cookbook") + def test_delete_cookbook_confirm_yes(self, mock_delete, mock_input): + args = MagicMock() + args.cookbook = "test_cookbook_id" + + delete_cookbook(args) + + mock_input.assert_called_once_with( + "[bold red]Are you sure you want to delete the cookbook (y/N)? [/]" + ) + mock_delete.assert_called_once_with("test_cookbook_id") + + @patch( + "moonshot.integrations.cli.benchmark.cookbook.console.input", return_value="n" + ) + @patch("moonshot.integrations.cli.benchmark.cookbook.api_delete_cookbook") + def test_delete_cookbook_confirm_no(self, mock_delete, mock_input): + args = MagicMock() + args.cookbook = "test_cookbook_id" + + delete_cookbook(args) + + mock_input.assert_called_once_with( + "[bold red]Are you sure you want to delete the cookbook (y/N)? [/]" + ) + mock_delete.assert_not_called() + + @patch( + "moonshot.integrations.cli.benchmark.cookbook.console.input", return_value="n" + ) + @patch("moonshot.integrations.cli.benchmark.cookbook.console.print") + @patch("moonshot.integrations.cli.benchmark.cookbook.api_delete_cookbook") + def test_delete_cookbook_cancelled_output( + self, mock_delete, mock_print, mock_input + ): + args = MagicMock() + args.cookbook = "test_cookbook_id" + + delete_cookbook(args) + + mock_input.assert_called_once_with( + "[bold red]Are you sure you want to delete the cookbook (y/N)? [/]" + ) + mock_print.assert_called_once_with( + "[bold yellow]Cookbook deletion cancelled.[/]" + ) + mock_delete.assert_not_called() diff --git a/tests/unit-tests/cli/test_datasets.py b/tests/unit-tests/cli/test_datasets.py new file mode 100644 index 00000000..150df012 --- /dev/null +++ b/tests/unit-tests/cli/test_datasets.py @@ -0,0 +1,658 @@ +from unittest.mock import MagicMock, patch + +import pytest + +from moonshot.integrations.cli.benchmark.datasets import ( + delete_dataset, + list_datasets, + view_dataset, +) + + +class TestCollectionCliDataset: + api_response = [ + { + "id": "squad-shifts-tnf", + "name": "squad-shifts-tnf", + "description": "Some description", + "examples": None, + "num_of_dataset_prompts": 48201, + "created_date": "2024-05-27 16:48:35", + "reference": "Some reference", + "license": "", + } + ] + api_response_pagination = [ + { + "id": "squad-shifts-tnf", + "name": "squad-shifts-tnf", + "description": "Some description", + "examples": None, + "num_of_dataset_prompts": 48201, + "created_date": "2024-05-27 16:48:35", + "reference": "Some reference", + "license": "", + "idx": 1, + } + ] + + @pytest.fixture(autouse=True) + def init(self): + # Perform tests + yield + + # ------------------------------------------------------------------------------ + # Test list_datasets functionality with non-mocked filter-data + # ------------------------------------------------------------------------------ + @pytest.mark.parametrize( + "find, pagination, api_response, expected_output, expected_log, to_be_called", + [ + # Valid cases + ( + None, + None, + api_response, + api_response, + "Listing datasets may take a while...", + True, + ), + # No datasets + ( + None, + None, + [], + None, + "Listing datasets may take a while...\nThere are no datasets found.", + False, + ), + ( + "squad", + None, + api_response, + api_response, + "Listing datasets may take a while...", + True, + ), + ( + None, + "(1, 1)", + api_response, + api_response_pagination, + "Listing datasets may take a while...", + True, + ), + ( + "Dataset", + 
"(1, 1)", + api_response, + None, + "Listing datasets may take a while...\nThere are no datasets found.", + False, + ), + # Invalid cases for find + ( + "", + None, + api_response, + None, + "Listing datasets may take a while...\n[list_datasets]: The 'find' argument must be a non-empty string and not None.", + False, + ), + ( + 99, + None, + api_response, + None, + "Listing datasets may take a while...\n[list_datasets]: The 'find' argument must be a non-empty string and not None.", + False, + ), + ( + {}, + None, + api_response, + None, + "Listing datasets may take a while...\n[list_datasets]: The 'find' argument must be a non-empty string and not None.", + False, + ), + ( + [], + None, + api_response, + None, + "Listing datasets may take a while...\n[list_datasets]: The 'find' argument must be a non-empty string and not None.", + False, + ), + ( + (), + None, + api_response, + None, + "Listing datasets may take a while...\n[list_datasets]: The 'find' argument must be a non-empty string and not None.", + False, + ), + ( + True, + None, + api_response, + None, + "Listing datasets may take a while...\n[list_datasets]: The 'find' argument must be a non-empty string and not None.", + False, + ), + # Invalid cases for pagination + ( + None, + "", + api_response, + None, + "Listing datasets may take a while...\n[list_datasets]: The 'pagination' argument must be a non-empty string and not None.", + False, + ), + ( + None, + 99, + api_response, + None, + "Listing datasets may take a while...\n[list_datasets]: The 'pagination' argument must be a non-empty string and not None.", + False, + ), + ( + None, + {}, + api_response, + None, + "Listing datasets may take a while...\n[list_datasets]: The 'pagination' argument must be a non-empty string and not None.", + False, + ), + ( + None, + [], + api_response, + None, + "Listing datasets may take a while...\n[list_datasets]: The 'pagination' argument must be a non-empty string and not None.", + False, + ), + ( + None, + (), + api_response, + None, + "Listing datasets may take a while...\n[list_datasets]: The 'pagination' argument must be a non-empty string and not None.", + False, + ), + ( + None, + True, + api_response, + None, + "Listing datasets may take a while...\n[list_datasets]: The 'pagination' argument must be a non-empty string and not None.", + False, + ), + ( + None, + True, + api_response, + None, + "Listing datasets may take a while...\n[list_datasets]: The 'pagination' argument must be a non-empty string and not None.", + False, + ), + ( + None, + "(1, 'a')", + api_response, + None, + "Listing datasets may take a while...\n[list_datasets]: The 'pagination' argument must be a tuple of two integers.", + False, + ), + ( + None, + "(1, 2, 3)", + api_response, + None, + "Listing datasets may take a while...\n[list_datasets]: The 'pagination' argument must be a tuple of two integers.", + False, + ), + ( + None, + "(1, )", + api_response, + None, + "Listing datasets may take a while...\n[list_datasets]: The 'pagination' argument must be a tuple of two integers.", + False, + ), + ( + None, + "(0, 1)", + api_response, + None, + "Listing datasets may take a while...\n[list_datasets]: Invalid page number or page size. Page number and page size should start from 1.", + False, + ), + ( + None, + "(1, 0)", + api_response, + None, + "Listing datasets may take a while...\n[list_datasets]: Invalid page number or page size. 
Page number and page size should start from 1.", + False, + ), + ( + None, + "(0, 0)", + api_response, + None, + "Listing datasets may take a while...\n[list_datasets]: Invalid page number or page size. Page number and page size should start from 1.", + False, + ), + ( + None, + "(1, -1)", + api_response, + None, + "Listing datasets may take a while...\n[list_datasets]: Invalid page number or page size. Page number and page size should start from 1.", + False, + ), + ( + None, + "(-1, 1)", + api_response, + None, + "Listing datasets may take a while...\n[list_datasets]: Invalid page number or page size. Page number and page size should start from 1.", + False, + ), + ( + None, + "(-1, -1)", + api_response, + None, + "Listing datasets may take a while...\n[list_datasets]: Invalid page number or page size. Page number and page size should start from 1.", + False, + ), + # Exception case + ( + None, + None, + api_response, + None, + "Listing datasets may take a while...\n[list_datasets]: An error has occurred while listing datasets.", + False, + ), + ], + ) + @patch("moonshot.integrations.cli.benchmark.datasets.api_get_all_datasets") + @patch("moonshot.integrations.cli.benchmark.datasets._display_datasets") + def test_list_datasets( + self, + mock_display_datasets, + mock_api_get_all_datasets, + find, + pagination, + api_response, + expected_output, + expected_log, + to_be_called, + capsys, + ): + if "error" in expected_log: + mock_api_get_all_datasets.side_effect = Exception( + "An error has occurred while listing datasets." + ) + else: + mock_api_get_all_datasets.return_value = api_response + + class Args: + pass + + args = Args() + args.find = find + args.pagination = pagination + + result = list_datasets(args) + + captured = capsys.readouterr() + assert expected_log == captured.out.strip() + assert result == expected_output + + if to_be_called: + mock_display_datasets.assert_called_once_with(api_response) + else: + mock_display_datasets.assert_not_called() + + # ------------------------------------------------------------------------------ + # Test list_datasets functionality with mocked filter-data + # ------------------------------------------------------------------------------ + @pytest.mark.parametrize( + "find, pagination, api_response, filtered_response, expected_output, expected_log, to_be_called", + [ + ( + None, + None, + api_response, + api_response_pagination, + api_response_pagination, + "Listing datasets may take a while...", + True, + ), + ( + "squad", + None, + api_response, + api_response_pagination, + api_response_pagination, + "Listing datasets may take a while...", + True, + ), + ( + None, + "(0, 1)", + api_response, + api_response_pagination, + api_response_pagination, + "Listing datasets may take a while...", + True, + ), + # Case where filtered_response is None + ( + None, + None, + api_response, + None, + None, + "Listing datasets may take a while...\nThere are no datasets found.", + False, + ), + # Case where filtered_response is an empty list + ( + None, + None, + api_response, + [], + None, + "Listing datasets may take a while...\nThere are no datasets found.", + False, + ), + ], + ) + @patch("moonshot.integrations.cli.benchmark.datasets.api_get_all_datasets") + @patch("moonshot.integrations.cli.benchmark.datasets._display_datasets") + @patch("moonshot.integrations.cli.benchmark.datasets.filter_data") + def test_list_datasets_filtered( + self, + mock_filter_data, + mock_display_datasets, + mock_api_get_all_datasets, + find, + pagination, + api_response, + 
filtered_response, + expected_output, + expected_log, + to_be_called, + capsys, + ): + mock_api_get_all_datasets.return_value = api_response + mock_filter_data.return_value = filtered_response + + class Args: + pass + + args = Args() + args.find = find + args.pagination = pagination + + result = list_datasets(args) + + captured = capsys.readouterr() + assert expected_log == captured.out.strip() + assert result == expected_output + + if to_be_called: + mock_display_datasets.assert_called_once_with(filtered_response) + else: + mock_display_datasets.assert_not_called() + + # ------------------------------------------------------------------------------ + # Test view_dataset functionality + # ------------------------------------------------------------------------------ + @pytest.mark.parametrize( + "dataset_filename, api_response, api_name_response, expected_log, to_be_called", + [ + # Valid case + ( + "squad-shifts-tnf", + api_response, + ["squad-shifts-tnf"], + "Viewing datasets may take a while...", + True, + ), + # Invalid case: dataset_filename is None + ( + None, + api_response, + ["squad-shifts-tnf"], + "Viewing datasets may take a while...\n[view_dataset]: The 'dataset_filename' argument must be a non-empty string and not None.", + False, + ), + # Invalid case: dataset_filename is not a string + ( + "", + api_response, + ["squad-shifts-tnf"], + "Viewing datasets may take a while...\n[view_dataset]: The 'dataset_filename' argument must be a non-empty string and not None.", + False, + ), + ( + 123, + api_response, + ["squad-shifts-tnf"], + "Viewing datasets may take a while...\n[view_dataset]: The 'dataset_filename' argument must be a non-empty string and not None.", + False, + ), + ( + {}, + api_response, + ["squad-shifts-tnf"], + "Viewing datasets may take a while...\n[view_dataset]: The 'dataset_filename' argument must be a non-empty string and not None.", + False, + ), + ( + [], + api_response, + ["squad-shifts-tnf"], + "Viewing datasets may take a while...\n[view_dataset]: The 'dataset_filename' argument must be a non-empty string and not None.", + False, + ), + ( + (), + api_response, + ["squad-shifts-tnf"], + "Viewing datasets may take a while...\n[view_dataset]: The 'dataset_filename' argument must be a non-empty string and not None.", + False, + ), + ( + True, + api_response, + ["squad-shifts-tnf"], + "Viewing datasets may take a while...\n[view_dataset]: The 'dataset_filename' argument must be a non-empty string and not None.", + False, + ), + # Exception case: api_get_all_datasets raises an exception + ( + "squad-shifts-tnf", + api_response, + ["squad-shifts-tnf"], + "Viewing datasets may take a while...\n[view_dataset]: An error has occurred while reading the datasets.", + False, + ), + # Exception case: api_get_all_datasets_name raises an exception + ( + "squad-shifts-tnf", + api_response, + ["squad-shifts-tnf"], + "Viewing datasets may take a while...\n[view_dataset]: An error has occurred while reading the dataset names.", + False, + ), + ], + ) + @patch("moonshot.integrations.cli.benchmark.datasets.api_get_all_datasets") + @patch("moonshot.integrations.cli.benchmark.datasets.api_get_all_datasets_name") + @patch("moonshot.integrations.cli.benchmark.datasets._display_datasets") + def test_view_dataset( + self, + mock_display_datasets, + mock_api_get_all_datasets_name, + mock_api_get_all_datasets, + dataset_filename, + api_response, + api_name_response, + expected_log, + to_be_called, + capsys, + ): + if "error" in expected_log: + if "reading the datasets." 
in expected_log: + mock_api_get_all_datasets.side_effect = Exception( + "An error has occurred while reading the datasets." + ) + if "reading the dataset names." in expected_log: + mock_api_get_all_datasets_name.side_effect = Exception( + "An error has occurred while reading the dataset names." + ) + else: + mock_api_get_all_datasets.return_value = api_response + mock_api_get_all_datasets_name.return_value = api_name_response + + class Args: + pass + + args = Args() + args.dataset_filename = dataset_filename + + view_dataset(args) + + captured = capsys.readouterr() + assert expected_log == captured.out.strip() + + if to_be_called: + mock_display_datasets.assert_called_once_with(api_response) + else: + mock_display_datasets.assert_not_called() + + # ------------------------------------------------------------------------------ + # Test delete_dataset functionality + # ------------------------------------------------------------------------------ + @pytest.mark.parametrize( + "dataset, expected_log, to_be_called", + [ + # Valid case + ("Dataset 1", "[delete_dataset]: Dataset deleted.", True), + # Invalid case - dataset + ( + "", + "[delete_dataset]: The 'dataset' argument must be a non-empty string and not None.", + False, + ), + ( + None, + "[delete_dataset]: The 'dataset' argument must be a non-empty string and not None.", + False, + ), + ( + 123, + "[delete_dataset]: The 'dataset' argument must be a non-empty string and not None.", + False, + ), + ( + {}, + "[delete_dataset]: The 'dataset' argument must be a non-empty string and not None.", + False, + ), + ( + [], + "[delete_dataset]: The 'dataset' argument must be a non-empty string and not None.", + False, + ), + ( + (), + "[delete_dataset]: The 'dataset' argument must be a non-empty string and not None.", + False, + ), + ( + True, + "[delete_dataset]: The 'dataset' argument must be a non-empty string and not None.", + False, + ), + ], + ) + @patch("moonshot.integrations.cli.benchmark.datasets.api_delete_dataset") + def test_delete_dataset( + self, mock_api_delete_dataset, capsys, dataset, expected_log, to_be_called + ): + class Args: + pass + + args = Args() + args.dataset = dataset + + with patch( + "moonshot.integrations.cli.benchmark.datasets.console.input", + return_value="y", + ): + with patch("moonshot.integrations.cli.benchmark.datasets.console.print"): + delete_dataset(args) + + captured = capsys.readouterr() + assert expected_log == captured.out.strip() + + if to_be_called: + mock_api_delete_dataset.assert_called_once_with(args.dataset) + else: + mock_api_delete_dataset.assert_not_called() + + @patch( + "moonshot.integrations.cli.benchmark.datasets.console.input", return_value="y" + ) + @patch("moonshot.integrations.cli.benchmark.datasets.api_delete_dataset") + def test_delete_dataset_confirm_yes(self, mock_delete, mock_input): + args = MagicMock() + args.dataset = "test_dataset_id" + + delete_dataset(args) + + mock_input.assert_called_once_with( + "[bold red]Are you sure you want to delete the dataset (y/N)? [/]" + ) + mock_delete.assert_called_once_with("test_dataset_id") + + @patch( + "moonshot.integrations.cli.benchmark.datasets.console.input", return_value="n" + ) + @patch("moonshot.integrations.cli.benchmark.datasets.api_delete_dataset") + def test_delete_dataset_confirm_no(self, mock_delete, mock_input): + args = MagicMock() + args.dataset = "test_dataset_id" + + delete_dataset(args) + + mock_input.assert_called_once_with( + "[bold red]Are you sure you want to delete the dataset (y/N)? 
[/]" + ) + mock_delete.assert_not_called() + + @patch( + "moonshot.integrations.cli.benchmark.datasets.console.input", return_value="n" + ) + @patch("moonshot.integrations.cli.benchmark.datasets.console.print") + @patch("moonshot.integrations.cli.benchmark.datasets.api_delete_dataset") + def test_delete_dataset_cancelled_output(self, mock_delete, mock_print, mock_input): + args = MagicMock() + args.dataset = "test_dataset_id" + + delete_dataset(args) + + mock_input.assert_called_once_with( + "[bold red]Are you sure you want to delete the dataset (y/N)? [/]" + ) + mock_print.assert_called_once_with( + "[bold yellow]Dataset deletion cancelled.[/]" + ) + mock_delete.assert_not_called() diff --git a/tests/unit-tests/cli/test_metrics.py b/tests/unit-tests/cli/test_metrics.py new file mode 100644 index 00000000..edeeca93 --- /dev/null +++ b/tests/unit-tests/cli/test_metrics.py @@ -0,0 +1,646 @@ +from unittest.mock import MagicMock, patch + +import pytest + +from moonshot.integrations.cli.benchmark.metrics import ( + delete_metric, + list_metrics, + view_metric, +) + + +class TestCollectionCliMetrics: + api_response = [ + { + "id": "bertscore", + "name": "BertScore", + "description": "Some description", + } + ] + api_response_pagination = [ + { + "id": "bertscore", + "name": "BertScore", + "description": "Some description", + "idx": 1, + } + ] + + @pytest.fixture(autouse=True) + def init(self): + # Perform tests + yield + + # ------------------------------------------------------------------------------ + # Test list_metrics functionality with non-mocked filter-data + # ------------------------------------------------------------------------------ + @pytest.mark.parametrize( + "find, pagination, api_response, expected_output, expected_log, to_be_called", + [ + # Valid cases + ( + None, + None, + api_response, + api_response, + "Listing metrics may take a while...", + True, + ), + # No metrics + ( + None, + None, + [], + None, + "Listing metrics may take a while...\nThere are no metrics found.", + False, + ), + ( + "bert", + None, + api_response, + api_response, + "Listing metrics may take a while...", + True, + ), + ( + None, + "(1, 1)", + api_response, + api_response_pagination, + "Listing metrics may take a while...", + True, + ), + ( + "Metrics", + "(1, 1)", + api_response, + None, + "Listing metrics may take a while...\nThere are no metrics found.", + False, + ), + # Invalid cases for find + ( + "", + None, + api_response, + None, + "Listing metrics may take a while...\n[list_metrics]: The 'find' argument must be a non-empty string and not None.", + False, + ), + ( + 99, + None, + api_response, + None, + "Listing metrics may take a while...\n[list_metrics]: The 'find' argument must be a non-empty string and not None.", + False, + ), + ( + {}, + None, + api_response, + None, + "Listing metrics may take a while...\n[list_metrics]: The 'find' argument must be a non-empty string and not None.", + False, + ), + ( + [], + None, + api_response, + None, + "Listing metrics may take a while...\n[list_metrics]: The 'find' argument must be a non-empty string and not None.", + False, + ), + ( + (), + None, + api_response, + None, + "Listing metrics may take a while...\n[list_metrics]: The 'find' argument must be a non-empty string and not None.", + False, + ), + ( + True, + None, + api_response, + None, + "Listing metrics may take a while...\n[list_metrics]: The 'find' argument must be a non-empty string and not None.", + False, + ), + # Invalid cases for pagination + ( + None, + "", + api_response, + 
None, + "Listing metrics may take a while...\n[list_metrics]: The 'pagination' argument must be a non-empty string and not None.", + False, + ), + ( + None, + 99, + api_response, + None, + "Listing metrics may take a while...\n[list_metrics]: The 'pagination' argument must be a non-empty string and not None.", + False, + ), + ( + None, + {}, + api_response, + None, + "Listing metrics may take a while...\n[list_metrics]: The 'pagination' argument must be a non-empty string and not None.", + False, + ), + ( + None, + [], + api_response, + None, + "Listing metrics may take a while...\n[list_metrics]: The 'pagination' argument must be a non-empty string and not None.", + False, + ), + ( + None, + (), + api_response, + None, + "Listing metrics may take a while...\n[list_metrics]: The 'pagination' argument must be a non-empty string and not None.", + False, + ), + ( + None, + True, + api_response, + None, + "Listing metrics may take a while...\n[list_metrics]: The 'pagination' argument must be a non-empty string and not None.", + False, + ), + ( + None, + True, + api_response, + None, + "Listing metrics may take a while...\n[list_metrics]: The 'pagination' argument must be a non-empty string and not None.", + False, + ), + ( + None, + "(1, 'a')", + api_response, + None, + "Listing metrics may take a while...\n[list_metrics]: The 'pagination' argument must be a tuple of two integers.", + False, + ), + ( + None, + "(1, 2, 3)", + api_response, + None, + "Listing metrics may take a while...\n[list_metrics]: The 'pagination' argument must be a tuple of two integers.", + False, + ), + ( + None, + "(1, )", + api_response, + None, + "Listing metrics may take a while...\n[list_metrics]: The 'pagination' argument must be a tuple of two integers.", + False, + ), + ( + None, + "(0, 1)", + api_response, + None, + "Listing metrics may take a while...\n[list_metrics]: Invalid page number or page size. Page number and page size should start from 1.", + False, + ), + ( + None, + "(1, 0)", + api_response, + None, + "Listing metrics may take a while...\n[list_metrics]: Invalid page number or page size. Page number and page size should start from 1.", + False, + ), + ( + None, + "(0, 0)", + api_response, + None, + "Listing metrics may take a while...\n[list_metrics]: Invalid page number or page size. Page number and page size should start from 1.", + False, + ), + ( + None, + "(1, -1)", + api_response, + None, + "Listing metrics may take a while...\n[list_metrics]: Invalid page number or page size. Page number and page size should start from 1.", + False, + ), + ( + None, + "(-1, 1)", + api_response, + None, + "Listing metrics may take a while...\n[list_metrics]: Invalid page number or page size. Page number and page size should start from 1.", + False, + ), + ( + None, + "(-1, -1)", + api_response, + None, + "Listing metrics may take a while...\n[list_metrics]: Invalid page number or page size. 
Page number and page size should start from 1.", + False, + ), + # Exception case + ( + None, + None, + api_response, + None, + "Listing metrics may take a while...\n[list_metrics]: An error has occurred while listing metrics.", + False, + ), + ], + ) + @patch("moonshot.integrations.cli.benchmark.metrics.api_get_all_metric") + @patch("moonshot.integrations.cli.benchmark.metrics._display_metrics") + def test_list_metrics( + self, + mock_display_metrics, + mock_api_get_all_metrics, + find, + pagination, + api_response, + expected_output, + expected_log, + to_be_called, + capsys, + ): + if "error" in expected_log: + mock_api_get_all_metrics.side_effect = Exception( + "An error has occurred while listing metrics." + ) + else: + mock_api_get_all_metrics.return_value = api_response + + class Args: + pass + + args = Args() + args.find = find + args.pagination = pagination + + result = list_metrics(args) + + captured = capsys.readouterr() + assert expected_log == captured.out.strip() + assert result == expected_output + + if to_be_called: + mock_display_metrics.assert_called_once_with(api_response) + else: + mock_display_metrics.assert_not_called() + + # ------------------------------------------------------------------------------ + # Test list_metrics functionality with mocked filter-data + # ------------------------------------------------------------------------------ + @pytest.mark.parametrize( + "find, pagination, api_response, filtered_response, expected_output, expected_log, to_be_called", + [ + ( + None, + None, + api_response, + api_response_pagination, + api_response_pagination, + "Listing metrics may take a while...", + True, + ), + ( + "squad", + None, + api_response, + api_response_pagination, + api_response_pagination, + "Listing metrics may take a while...", + True, + ), + ( + None, + "(0, 1)", + api_response, + api_response_pagination, + api_response_pagination, + "Listing metrics may take a while...", + True, + ), + # Case where filtered_response is None + ( + None, + None, + api_response, + None, + None, + "Listing metrics may take a while...\nThere are no metrics found.", + False, + ), + # Case where filtered_response is an empty list + ( + None, + None, + api_response, + [], + None, + "Listing metrics may take a while...\nThere are no metrics found.", + False, + ), + ], + ) + @patch("moonshot.integrations.cli.benchmark.metrics.api_get_all_metric") + @patch("moonshot.integrations.cli.benchmark.metrics._display_metrics") + @patch("moonshot.integrations.cli.benchmark.metrics.filter_data") + def test_list_metrics_filtered( + self, + mock_filter_data, + mock_display_metrics, + mock_api_get_all_metrics, + find, + pagination, + api_response, + filtered_response, + expected_output, + expected_log, + to_be_called, + capsys, + ): + mock_api_get_all_metrics.return_value = api_response + mock_filter_data.return_value = filtered_response + + class Args: + pass + + args = Args() + args.find = find + args.pagination = pagination + + result = list_metrics(args) + + captured = capsys.readouterr() + assert expected_log == captured.out.strip() + assert result == expected_output + + if to_be_called: + mock_display_metrics.assert_called_once_with(filtered_response) + else: + mock_display_metrics.assert_not_called() + + # ------------------------------------------------------------------------------ + # Test view_metric functionality + # ------------------------------------------------------------------------------ + @pytest.mark.parametrize( + "metric_filename, api_response, api_name_response, 
expected_log, to_be_called", + [ + # Valid case + ( + "bertscore", + api_response, + ["bertscore"], + "Viewing metrics may take a while...", + True, + ), + # Invalid case: metric_filename is None + ( + None, + api_response, + ["bertscore"], + "Viewing metrics may take a while...\n[view_metric]: The 'metric_filename' argument must be a non-empty string and not None.", + False, + ), + # Invalid case: metric_filename is not a string + ( + "", + api_response, + ["bertscore"], + "Viewing metrics may take a while...\n[view_metric]: The 'metric_filename' argument must be a non-empty string and not None.", + False, + ), + ( + 123, + api_response, + ["bertscore"], + "Viewing metrics may take a while...\n[view_metric]: The 'metric_filename' argument must be a non-empty string and not None.", + False, + ), + ( + {}, + api_response, + ["bertscore"], + "Viewing metrics may take a while...\n[view_metric]: The 'metric_filename' argument must be a non-empty string and not None.", + False, + ), + ( + [], + api_response, + ["bertscore"], + "Viewing metrics may take a while...\n[view_metric]: The 'metric_filename' argument must be a non-empty string and not None.", + False, + ), + ( + (), + api_response, + ["bertscore"], + "Viewing metrics may take a while...\n[view_metric]: The 'metric_filename' argument must be a non-empty string and not None.", + False, + ), + ( + True, + api_response, + ["bertscore"], + "Viewing metrics may take a while...\n[view_metric]: The 'metric_filename' argument must be a non-empty string and not None.", + False, + ), + # Exception case: api_get_all_metrics raises an exception + ( + "bertscore", + api_response, + ["bertscore"], + "Viewing metrics may take a while...\n[view_metric]: An error has occurred while reading the metrics.", + False, + ), + # Exception case: api_get_all_metric_name raises an exception + ( + "bertscore", + api_response, + ["bertscore"], + "Viewing metrics may take a while...\n[view_metric]: An error has occurred while reading the metric names.", + False, + ), + ], + ) + @patch("moonshot.integrations.cli.benchmark.metrics.api_get_all_metric") + @patch("moonshot.integrations.cli.benchmark.metrics.api_get_all_metric_name") + @patch("moonshot.integrations.cli.benchmark.metrics._display_metrics") + def test_view_metric( + self, + mock_display_metrics, + mock_api_get_all_metric_name, + mock_api_get_all_metrics, + metric_filename, + api_response, + api_name_response, + expected_log, + to_be_called, + capsys, + ): + if "error" in expected_log: + if "reading the metrics." in expected_log: + mock_api_get_all_metrics.side_effect = Exception( + "An error has occurred while reading the metrics." + ) + if "reading the metric names." in expected_log: + mock_api_get_all_metric_name.side_effect = Exception( + "An error has occurred while reading the metric names." 
+ ) + else: + mock_api_get_all_metrics.return_value = api_response + mock_api_get_all_metric_name.return_value = api_name_response + + class Args: + pass + + args = Args() + args.metric_filename = metric_filename + + view_metric(args) + + captured = capsys.readouterr() + assert expected_log == captured.out.strip() + + if to_be_called: + mock_display_metrics.assert_called_once_with(api_response) + else: + mock_display_metrics.assert_not_called() + + # ------------------------------------------------------------------------------ + # Test delete_metric functionality + # ------------------------------------------------------------------------------ + @pytest.mark.parametrize( + "metric, expected_log, to_be_called", + [ + # Valid case + ("Metric 1", "[delete_metric]: Metric deleted.", True), + # Invalid case - metric + ( + "", + "[delete_metric]: The 'metric' argument must be a non-empty string and not None.", + False, + ), + ( + None, + "[delete_metric]: The 'metric' argument must be a non-empty string and not None.", + False, + ), + ( + 123, + "[delete_metric]: The 'metric' argument must be a non-empty string and not None.", + False, + ), + ( + {}, + "[delete_metric]: The 'metric' argument must be a non-empty string and not None.", + False, + ), + ( + [], + "[delete_metric]: The 'metric' argument must be a non-empty string and not None.", + False, + ), + ( + (), + "[delete_metric]: The 'metric' argument must be a non-empty string and not None.", + False, + ), + ( + True, + "[delete_metric]: The 'metric' argument must be a non-empty string and not None.", + False, + ), + ], + ) + @patch("moonshot.integrations.cli.benchmark.metrics.api_delete_metric") + def test_delete_metric( + self, mock_api_delete_metric, capsys, metric, expected_log, to_be_called + ): + class Args: + pass + + args = Args() + args.metric = metric + + with patch( + "moonshot.integrations.cli.benchmark.metrics.console.input", + return_value="y", + ): + with patch("moonshot.integrations.cli.benchmark.metrics.console.print"): + delete_metric(args) + + captured = capsys.readouterr() + assert expected_log == captured.out.strip() + + if to_be_called: + mock_api_delete_metric.assert_called_once_with(args.metric) + else: + mock_api_delete_metric.assert_not_called() + + @patch( + "moonshot.integrations.cli.benchmark.metrics.console.input", return_value="y" + ) + @patch("moonshot.integrations.cli.benchmark.metrics.api_delete_metric") + def test_delete_metric_confirm_yes(self, mock_delete, mock_input): + args = MagicMock() + args.metric = "test_metric_id" + + delete_metric(args) + + mock_input.assert_called_once_with( + "[bold red]Are you sure you want to delete the metric (y/N)? [/]" + ) + mock_delete.assert_called_once_with("test_metric_id") + + @patch( + "moonshot.integrations.cli.benchmark.metrics.console.input", return_value="n" + ) + @patch("moonshot.integrations.cli.benchmark.metrics.api_delete_metric") + def test_delete_metric_confirm_no(self, mock_delete, mock_input): + args = MagicMock() + args.metric = "test_metric_id" + + delete_metric(args) + + mock_input.assert_called_once_with( + "[bold red]Are you sure you want to delete the metric (y/N)? 
[/]" + ) + mock_delete.assert_not_called() + + @patch( + "moonshot.integrations.cli.benchmark.metrics.console.input", return_value="n" + ) + @patch("moonshot.integrations.cli.benchmark.metrics.console.print") + @patch("moonshot.integrations.cli.benchmark.metrics.api_delete_metric") + def test_delete_metric_cancelled_output(self, mock_delete, mock_print, mock_input): + args = MagicMock() + args.metric = "test_metric_id" + + delete_metric(args) + + mock_input.assert_called_once_with( + "[bold red]Are you sure you want to delete the metric (y/N)? [/]" + ) + mock_print.assert_called_once_with("[bold yellow]Metric deletion cancelled.[/]") + mock_delete.assert_not_called() diff --git a/tests/unit-tests/cli/test_recipe.py b/tests/unit-tests/cli/test_recipe.py new file mode 100644 index 00000000..eccea938 --- /dev/null +++ b/tests/unit-tests/cli/test_recipe.py @@ -0,0 +1,2489 @@ +from ast import literal_eval +from unittest.mock import AsyncMock, MagicMock, patch + +import pytest + +from moonshot.integrations.cli.benchmark.recipe import ( + add_recipe, + delete_recipe, + list_recipes, + run_recipe, + update_recipe, + view_recipe, +) + + +class TestCollectionCliRecipe: + api_response = [ + { + "id": "realtime-qa", + "name": "RealtimeQA", + "description": "Some description.", + "tags": ["Hallucination"], + "categories": ["Trust & Safety"], + "datasets": ["realtimeqa-past"], + "prompt_templates": [], + "metrics": ["exactstrmatch"], + "grading_scale": {"A": [80, 100]}, + "stats": { + "num_of_tags": 1, + "num_of_datasets": 1, + "num_of_prompt_templates": 0, + "num_of_metrics": 1, + "num_of_datasets_prompts": {"realtimeqa-past": 50}, + }, + } + ] + api_response_pagination = [ + { + "id": "realtime-qa", + "name": "RealtimeQA", + "description": "Some description.", + "tags": ["Hallucination"], + "categories": ["Trust & Safety"], + "datasets": ["realtimeqa-past"], + "prompt_templates": [], + "metrics": ["exactstrmatch"], + "grading_scale": {"A": [80, 100]}, + "stats": { + "num_of_tags": 1, + "num_of_datasets": 1, + "num_of_prompt_templates": 0, + "num_of_metrics": 1, + "num_of_datasets_prompts": {"realtimeqa-past": 50}, + }, + "idx": 1, + } + ] + + @pytest.fixture(autouse=True) + def init(self): + # Perform tests + yield + + # ------------------------------------------------------------------------------ + # Test add_recipe functionality + # ------------------------------------------------------------------------------ + @pytest.mark.parametrize( + "name, description, tags, categories, datasets, prompt_templates, metrics, grading_scale, expected_output, expected_call", + [ + # Valid case + ( + "New Recipe ID", + "This is a test recipe.", + "['tag1']", + "['category1']", + "['dataset1','dataset2']", + "['prompt_template1','prompt_template2']", + "['metrics1', 'metrics2']", + "{'A': [80,100], 'B': [60,79]}", + "[add_recipe]: Recipe (new_recipe_id) created.", + True, + ), + # Invalid case for name + ( + None, + "This is a test recipe.", + "['tag1']", + "['category1']", + "['dataset1','dataset2']", + "['prompt_template1','prompt_template2']", + "['metrics1', 'metrics2']", + "{'A': [80,100], 'B': [60,79]}", + "[add_recipe]: The 'name' argument must be a non-empty string and not None.", + False, + ), + ( + "", + "This is a test recipe.", + "['tag1']", + "['category1']", + "['dataset1','dataset2']", + "['prompt_template1','prompt_template2']", + "['metrics1', 'metrics2']", + "{'A': [80,100], 'B': [60,79]}", + "[add_recipe]: The 'name' argument must be a non-empty string and not None.", + False, + ), + ( + 99, + 
"This is a test recipe.", + "['tag1']", + "['category1']", + "['dataset1','dataset2']", + "['prompt_template1','prompt_template2']", + "['metrics1', 'metrics2']", + "{'A': [80,100], 'B': [60,79]}", + "[add_recipe]: The 'name' argument must be a non-empty string and not None.", + False, + ), + ( + {}, + "This is a test recipe.", + "['tag1']", + "['category1']", + "['dataset1','dataset2']", + "['prompt_template1','prompt_template2']", + "['metrics1', 'metrics2']", + "{'A': [80,100], 'B': [60,79]}", + "[add_recipe]: The 'name' argument must be a non-empty string and not None.", + False, + ), + ( + [], + "This is a test recipe.", + "['tag1']", + "['category1']", + "['dataset1','dataset2']", + "['prompt_template1','prompt_template2']", + "['metrics1', 'metrics2']", + "{'A': [80,100], 'B': [60,79]}", + "[add_recipe]: The 'name' argument must be a non-empty string and not None.", + False, + ), + ( + (), + "This is a test recipe.", + "['tag1']", + "['category1']", + "['dataset1','dataset2']", + "['prompt_template1','prompt_template2']", + "['metrics1', 'metrics2']", + "{'A': [80,100], 'B': [60,79]}", + "[add_recipe]: The 'name' argument must be a non-empty string and not None.", + False, + ), + ( + True, + "This is a test recipe.", + "['tag1']", + "['category1']", + "['dataset1','dataset2']", + "['prompt_template1','prompt_template2']", + "['metrics1', 'metrics2']", + "{'A': [80,100], 'B': [60,79]}", + "[add_recipe]: The 'name' argument must be a non-empty string and not None.", + False, + ), + # Invalid case for description + ( + "New Recipe ID", + None, + "['tag1']", + "['category1']", + "['dataset1','dataset2']", + "['prompt_template1','prompt_template2']", + "['metrics1', 'metrics2']", + "{'A': [80,100], 'B': [60,79]}", + "[add_recipe]: The 'description' argument must be a non-empty string and not None.", + False, + ), + ( + "New Recipe ID", + "", + "['tag1']", + "['category1']", + "['dataset1','dataset2']", + "['prompt_template1','prompt_template2']", + "['metrics1', 'metrics2']", + "{'A': [80,100], 'B': [60,79]}", + "[add_recipe]: The 'description' argument must be a non-empty string and not None.", + False, + ), + ( + "New Recipe ID", + 99, + "['tag1']", + "['category1']", + "['dataset1','dataset2']", + "['prompt_template1','prompt_template2']", + "['metrics1', 'metrics2']", + "{'A': [80,100], 'B': [60,79]}", + "[add_recipe]: The 'description' argument must be a non-empty string and not None.", + False, + ), + ( + "New Recipe ID", + {}, + "['tag1']", + "['category1']", + "['dataset1','dataset2']", + "['prompt_template1','prompt_template2']", + "['metrics1', 'metrics2']", + "{'A': [80,100], 'B': [60,79]}", + "[add_recipe]: The 'description' argument must be a non-empty string and not None.", + False, + ), + ( + "New Recipe ID", + [], + "['tag1']", + "['category1']", + "['dataset1','dataset2']", + "['prompt_template1','prompt_template2']", + "['metrics1', 'metrics2']", + "{'A': [80,100], 'B': [60,79]}", + "[add_recipe]: The 'description' argument must be a non-empty string and not None.", + False, + ), + ( + "New Recipe ID", + (), + "['tag1']", + "['category1']", + "['dataset1','dataset2']", + "['prompt_template1','prompt_template2']", + "['metrics1', 'metrics2']", + "{'A': [80,100], 'B': [60,79]}", + "[add_recipe]: The 'description' argument must be a non-empty string and not None.", + False, + ), + ( + "New Recipe ID", + True, + "['tag1']", + "['category1']", + "['dataset1','dataset2']", + "['prompt_template1','prompt_template2']", + "['metrics1', 'metrics2']", + "{'A': [80,100], 'B': 
[60,79]}", + "[add_recipe]: The 'description' argument must be a non-empty string and not None.", + False, + ), + # Invalid case for tags - not a list of strings + ( + "New Recipe ID", + "This is a test recipe.", + "None", + "['category1']", + "['dataset1','dataset2']", + "['prompt_template1','prompt_template2']", + "['metrics1', 'metrics2']", + "{'A': [80,100], 'B': [60,79]}", + "[add_recipe]: The 'tags' argument must be a list of strings after evaluation.", + False, + ), + ( + "New Recipe ID", + "This is a test recipe.", + "[123, 'recipe2']", + "['category1']", + "['dataset1','dataset2']", + "['prompt_template1','prompt_template2']", + "['metrics1', 'metrics2']", + "{'A': [80,100], 'B': [60,79]}", + "[add_recipe]: The 'tags' argument must be a list of strings after evaluation.", + False, + ), + # Invalid case for tags + ( + "New Recipe ID", + "This is a test recipe.", + None, + "['category1']", + "['dataset1','dataset2']", + "['prompt_template1','prompt_template2']", + "['metrics1', 'metrics2']", + "{'A': [80,100], 'B': [60,79]}", + "[add_recipe]: The 'tags' argument must be a non-empty string and not None.", + False, + ), + ( + "New Recipe ID", + "This is a test recipe.", + "", + "['category1']", + "['dataset1','dataset2']", + "['prompt_template1','prompt_template2']", + "['metrics1', 'metrics2']", + "{'A': [80,100], 'B': [60,79]}", + "[add_recipe]: The 'tags' argument must be a non-empty string and not None.", + False, + ), + ( + "New Recipe ID", + "This is a test recipe.", + 99, + "['category1']", + "['dataset1','dataset2']", + "['prompt_template1','prompt_template2']", + "['metrics1', 'metrics2']", + "{'A': [80,100], 'B': [60,79]}", + "[add_recipe]: The 'tags' argument must be a non-empty string and not None.", + False, + ), + ( + "New Recipe ID", + "This is a test recipe.", + {}, + "['category1']", + "['dataset1','dataset2']", + "['prompt_template1','prompt_template2']", + "['metrics1', 'metrics2']", + "{'A': [80,100], 'B': [60,79]}", + "[add_recipe]: The 'tags' argument must be a non-empty string and not None.", + False, + ), + ( + "New Recipe ID", + "This is a test recipe.", + [], + "['category1']", + "['dataset1','dataset2']", + "['prompt_template1','prompt_template2']", + "['metrics1', 'metrics2']", + "{'A': [80,100], 'B': [60,79]}", + "[add_recipe]: The 'tags' argument must be a non-empty string and not None.", + False, + ), + ( + "New Recipe ID", + "This is a test recipe.", + (), + "['category1']", + "['dataset1','dataset2']", + "['prompt_template1','prompt_template2']", + "['metrics1', 'metrics2']", + "{'A': [80,100], 'B': [60,79]}", + "[add_recipe]: The 'tags' argument must be a non-empty string and not None.", + False, + ), + ( + "New Recipe ID", + "This is a test recipe.", + True, + "['category1']", + "['dataset1','dataset2']", + "['prompt_template1','prompt_template2']", + "['metrics1', 'metrics2']", + "{'A': [80,100], 'B': [60,79]}", + "[add_recipe]: The 'tags' argument must be a non-empty string and not None.", + False, + ), + # Invalid case for categories - not a list of strings + ( + "New Recipe ID", + "This is a test recipe.", + "['tag1']", + "None", + "['dataset1','dataset2']", + "['prompt_template1','prompt_template2']", + "['metrics1', 'metrics2']", + "{'A': [80,100], 'B': [60,79]}", + "[add_recipe]: The 'categories' argument must be a list of strings after evaluation.", + False, + ), + ( + "New Recipe ID", + "This is a test recipe.", + "['tag1']", + "[123, 'category2']", + "['dataset1','dataset2']", + "['prompt_template1','prompt_template2']", + "['metrics1', 
'metrics2']", + "{'A': [80,100], 'B': [60,79]}", + "[add_recipe]: The 'categories' argument must be a list of strings after evaluation.", + False, + ), + # Invalid case for categories + ( + "New Recipe ID", + "This is a test recipe.", + "['tag1']", + None, + "['dataset1','dataset2']", + "['prompt_template1','prompt_template2']", + "['metrics1', 'metrics2']", + "{'A': [80,100], 'B': [60,79]}", + "[add_recipe]: The 'categories' argument must be a non-empty string and not None.", + False, + ), + ( + "New Recipe ID", + "This is a test recipe.", + "['tag1']", + "", + "['dataset1','dataset2']", + "['prompt_template1','prompt_template2']", + "['metrics1', 'metrics2']", + "{'A': [80,100], 'B': [60,79]}", + "[add_recipe]: The 'categories' argument must be a non-empty string and not None.", + False, + ), + ( + "New Recipe ID", + "This is a test recipe.", + "['tag1']", + 99, + "['dataset1','dataset2']", + "['prompt_template1','prompt_template2']", + "['metrics1', 'metrics2']", + "{'A': [80,100], 'B': [60,79]}", + "[add_recipe]: The 'categories' argument must be a non-empty string and not None.", + False, + ), + ( + "New Recipe ID", + "This is a test recipe.", + "['tag1']", + {}, + "['dataset1','dataset2']", + "['prompt_template1','prompt_template2']", + "['metrics1', 'metrics2']", + "{'A': [80,100], 'B': [60,79]}", + "[add_recipe]: The 'categories' argument must be a non-empty string and not None.", + False, + ), + ( + "New Recipe ID", + "This is a test recipe.", + "['tag1']", + [], + "['dataset1','dataset2']", + "['prompt_template1','prompt_template2']", + "['metrics1', 'metrics2']", + "{'A': [80,100], 'B': [60,79]}", + "[add_recipe]: The 'categories' argument must be a non-empty string and not None.", + False, + ), + ( + "New Recipe ID", + "This is a test recipe.", + "['tag1']", + (), + "['dataset1','dataset2']", + "['prompt_template1','prompt_template2']", + "['metrics1', 'metrics2']", + "{'A': [80,100], 'B': [60,79]}", + "[add_recipe]: The 'categories' argument must be a non-empty string and not None.", + False, + ), + ( + "New Recipe ID", + "This is a test recipe.", + "['tag1']", + True, + "['dataset1','dataset2']", + "['prompt_template1','prompt_template2']", + "['metrics1', 'metrics2']", + "{'A': [80,100], 'B': [60,79]}", + "[add_recipe]: The 'categories' argument must be a non-empty string and not None.", + False, + ), + # Invalid case for datasets - not a list of strings + ( + "New Recipe ID", + "This is a test recipe.", + "['tag1']", + "['category1']", + "None", + "['prompt_template1','prompt_template2']", + "['metrics1', 'metrics2']", + "{'A': [80,100], 'B': [60,79]}", + "[add_recipe]: The 'datasets' argument must be a list of strings after evaluation.", + False, + ), + ( + "New Recipe ID", + "This is a test recipe.", + "['tag1']", + "['category1']", + "[123, 'dataset2']", + "['prompt_template1','prompt_template2']", + "['metrics1', 'metrics2']", + "{'A': [80,100], 'B': [60,79]}", + "[add_recipe]: The 'datasets' argument must be a list of strings after evaluation.", + False, + ), + # Invalid case for datasets + ( + "New Recipe ID", + "This is a test recipe.", + "['tag1']", + "['category1']", + None, + "['prompt_template1','prompt_template2']", + "['metrics1', 'metrics2']", + "{'A': [80,100], 'B': [60,79]}", + "[add_recipe]: The 'datasets' argument must be a non-empty string and not None.", + False, + ), + ( + "New Recipe ID", + "This is a test recipe.", + "['tag1']", + "['category1']", + "", + "['prompt_template1','prompt_template2']", + "['metrics1', 'metrics2']", + "{'A': [80,100], 'B': 
[60,79]}", + "[add_recipe]: The 'datasets' argument must be a non-empty string and not None.", + False, + ), + ( + "New Recipe ID", + "This is a test recipe.", + "['tag1']", + "['category1']", + 99, + "['prompt_template1','prompt_template2']", + "['metrics1', 'metrics2']", + "{'A': [80,100], 'B': [60,79]}", + "[add_recipe]: The 'datasets' argument must be a non-empty string and not None.", + False, + ), + ( + "New Recipe ID", + "This is a test recipe.", + "['tag1']", + "['category1']", + {}, + "['prompt_template1','prompt_template2']", + "['metrics1', 'metrics2']", + "{'A': [80,100], 'B': [60,79]}", + "[add_recipe]: The 'datasets' argument must be a non-empty string and not None.", + False, + ), + ( + "New Recipe ID", + "This is a test recipe.", + "['tag1']", + "['category1']", + [], + "['prompt_template1','prompt_template2']", + "['metrics1', 'metrics2']", + "{'A': [80,100], 'B': [60,79]}", + "[add_recipe]: The 'datasets' argument must be a non-empty string and not None.", + False, + ), + ( + "New Recipe ID", + "This is a test recipe.", + "['tag1']", + "['category1']", + (), + "['prompt_template1','prompt_template2']", + "['metrics1', 'metrics2']", + "{'A': [80,100], 'B': [60,79]}", + "[add_recipe]: The 'datasets' argument must be a non-empty string and not None.", + False, + ), + ( + "New Recipe ID", + "This is a test recipe.", + "['tag1']", + "['category1']", + True, + "['prompt_template1','prompt_template2']", + "['metrics1', 'metrics2']", + "{'A': [80,100], 'B': [60,79]}", + "[add_recipe]: The 'datasets' argument must be a non-empty string and not None.", + False, + ), + # Invalid case for prompt_templates - not a list of strings + ( + "New Recipe ID", + "This is a test recipe.", + "['tag1']", + "['category1']", + "['dataset1','dataset2']", + "None", + "['metrics1', 'metrics2']", + "{'A': [80,100], 'B': [60,79]}", + "[add_recipe]: The 'prompt_templates' argument must be a list of strings after evaluation.", + False, + ), + ( + "New Recipe ID", + "This is a test recipe.", + "['tag1']", + "['category1']", + "['dataset1','dataset2']", + "[123, 'template2']", + "['metrics1', 'metrics2']", + "{'A': [80,100], 'B': [60,79]}", + "[add_recipe]: The 'prompt_templates' argument must be a list of strings after evaluation.", + False, + ), + # Invalid case for prompt_templates + ( + "New Recipe ID", + "This is a test recipe.", + "['tag1']", + "['category1']", + "['dataset1','dataset2']", + None, + "['metrics1', 'metrics2']", + "{'A': [80,100], 'B': [60,79]}", + "[add_recipe]: The 'prompt_templates' argument must be a non-empty string and not None.", + False, + ), + ( + "New Recipe ID", + "This is a test recipe.", + "['tag1']", + "['category1']", + "['dataset1','dataset2']", + "", + "['metrics1', 'metrics2']", + "{'A': [80,100], 'B': [60,79]}", + "[add_recipe]: The 'prompt_templates' argument must be a non-empty string and not None.", + False, + ), + ( + "New Recipe ID", + "This is a test recipe.", + "['tag1']", + "['category1']", + "['dataset1','dataset2']", + 99, + "['metrics1', 'metrics2']", + "{'A': [80,100], 'B': [60,79]}", + "[add_recipe]: The 'prompt_templates' argument must be a non-empty string and not None.", + False, + ), + ( + "New Recipe ID", + "This is a test recipe.", + "['tag1']", + "['category1']", + "['dataset1','dataset2']", + {}, + "['metrics1', 'metrics2']", + "{'A': [80,100], 'B': [60,79]}", + "[add_recipe]: The 'prompt_templates' argument must be a non-empty string and not None.", + False, + ), + ( + "New Recipe ID", + "This is a test recipe.", + "['tag1']", + "['category1']", 
+ "['dataset1','dataset2']", + [], + "['metrics1', 'metrics2']", + "{'A': [80,100], 'B': [60,79]}", + "[add_recipe]: The 'prompt_templates' argument must be a non-empty string and not None.", + False, + ), + ( + "New Recipe ID", + "This is a test recipe.", + "['tag1']", + "['category1']", + "['dataset1','dataset2']", + (), + "['metrics1', 'metrics2']", + "{'A': [80,100], 'B': [60,79]}", + "[add_recipe]: The 'prompt_templates' argument must be a non-empty string and not None.", + False, + ), + ( + "New Recipe ID", + "This is a test recipe.", + "['tag1']", + "['category1']", + "['dataset1','dataset2']", + True, + "['metrics1', 'metrics2']", + "{'A': [80,100], 'B': [60,79]}", + "[add_recipe]: The 'prompt_templates' argument must be a non-empty string and not None.", + False, + ), + # Invalid case for metrics - not a list of strings + ( + "New Recipe ID", + "This is a test recipe.", + "['tag1']", + "['category1']", + "['dataset1','dataset2']", + "['prompt_template1','prompt_template2']", + "None", + "{'A': [80,100], 'B': [60,79]}", + "[add_recipe]: The 'metrics' argument must be a list of strings after evaluation.", + False, + ), + ( + "New Recipe ID", + "This is a test recipe.", + "['tag1']", + "['category1']", + "['dataset1','dataset2']", + "['prompt_template1','prompt_template2']", + "[123, 'metric2']", + "{'A': [80,100], 'B': [60,79]}", + "[add_recipe]: The 'metrics' argument must be a list of strings after evaluation.", + False, + ), + # Invalid case for metrics + ( + "New Recipe ID", + "This is a test recipe.", + "['tag1']", + "['category1']", + "['dataset1','dataset2']", + "['prompt_template1','prompt_template2']", + None, + "{'A': [80,100], 'B': [60,79]}", + "[add_recipe]: The 'metrics' argument must be a non-empty string and not None.", + False, + ), + ( + "New Recipe ID", + "This is a test recipe.", + "['tag1']", + "['category1']", + "['dataset1','dataset2']", + "['prompt_template1','prompt_template2']", + "", + "{'A': [80,100], 'B': [60,79]}", + "[add_recipe]: The 'metrics' argument must be a non-empty string and not None.", + False, + ), + ( + "New Recipe ID", + "This is a test recipe.", + "['tag1']", + "['category1']", + "['dataset1','dataset2']", + "['prompt_template1','prompt_template2']", + 99, + "{'A': [80,100], 'B': [60,79]}", + "[add_recipe]: The 'metrics' argument must be a non-empty string and not None.", + False, + ), + ( + "New Recipe ID", + "This is a test recipe.", + "['tag1']", + "['category1']", + "['dataset1','dataset2']", + "['prompt_template1','prompt_template2']", + {}, + "{'A': [80,100], 'B': [60,79]}", + "[add_recipe]: The 'metrics' argument must be a non-empty string and not None.", + False, + ), + ( + "New Recipe ID", + "This is a test recipe.", + "['tag1']", + "['category1']", + "['dataset1','dataset2']", + "['prompt_template1','prompt_template2']", + [], + "{'A': [80,100], 'B': [60,79]}", + "[add_recipe]: The 'metrics' argument must be a non-empty string and not None.", + False, + ), + ( + "New Recipe ID", + "This is a test recipe.", + "['tag1']", + "['category1']", + "['dataset1','dataset2']", + "['prompt_template1','prompt_template2']", + (), + "{'A': [80,100], 'B': [60,79]}", + "[add_recipe]: The 'metrics' argument must be a non-empty string and not None.", + False, + ), + ( + "New Recipe ID", + "This is a test recipe.", + "['tag1']", + "['category1']", + "['dataset1','dataset2']", + "['prompt_template1','prompt_template2']", + True, + "{'A': [80,100], 'B': [60,79]}", + "[add_recipe]: The 'metrics' argument must be a non-empty string and not None.", + 
False, + ), + # Invalid case for grading_scale - not a dictionary + ( + "New Recipe ID", + "This is a test recipe.", + "['tag1']", + "['category1']", + "['dataset1','dataset2']", + "['prompt_template1','prompt_template2']", + "['metrics1', 'metrics2']", + "None", + "[add_recipe]: The 'grading_scale' argument must be a dictionary after evaluation.", + False, + ), + ( + "New Recipe ID", + "This is a test recipe.", + "['tag1']", + "['category1']", + "['dataset1','dataset2']", + "['prompt_template1','prompt_template2']", + "['metrics1', 'metrics2']", + "[123, 'scale2']", + "[add_recipe]: The 'grading_scale' argument must be a dictionary after evaluation.", + False, + ), + # Invalid case for grading_scale + ( + "New Recipe ID", + "This is a test recipe.", + "['tag1']", + "['category1']", + "['dataset1','dataset2']", + "['prompt_template1','prompt_template2']", + "['metrics1', 'metrics2']", + None, + "[add_recipe]: The 'grading_scale' argument must be a non-empty string and not None.", + False, + ), + ( + "New Recipe ID", + "This is a test recipe.", + "['tag1']", + "['category1']", + "['dataset1','dataset2']", + "['prompt_template1','prompt_template2']", + "['metrics1', 'metrics2']", + "", + "[add_recipe]: The 'grading_scale' argument must be a non-empty string and not None.", + False, + ), + ( + "New Recipe ID", + "This is a test recipe.", + "['tag1']", + "['category1']", + "['dataset1','dataset2']", + "['prompt_template1','prompt_template2']", + "['metrics1', 'metrics2']", + 99, + "[add_recipe]: The 'grading_scale' argument must be a non-empty string and not None.", + False, + ), + ( + "New Recipe ID", + "This is a test recipe.", + "['tag1']", + "['category1']", + "['dataset1','dataset2']", + "['prompt_template1','prompt_template2']", + "['metrics1', 'metrics2']", + {}, + "[add_recipe]: The 'grading_scale' argument must be a non-empty string and not None.", + False, + ), + ( + "New Recipe ID", + "This is a test recipe.", + "['tag1']", + "['category1']", + "['dataset1','dataset2']", + "['prompt_template1','prompt_template2']", + "['metrics1', 'metrics2']", + [], + "[add_recipe]: The 'grading_scale' argument must be a non-empty string and not None.", + False, + ), + ( + "New Recipe ID", + "This is a test recipe.", + "['tag1']", + "['category1']", + "['dataset1','dataset2']", + "['prompt_template1','prompt_template2']", + "['metrics1', 'metrics2']", + (), + "[add_recipe]: The 'grading_scale' argument must be a non-empty string and not None.", + False, + ), + ( + "New Recipe ID", + "This is a test recipe.", + "['tag1']", + "['category1']", + "['dataset1','dataset2']", + "['prompt_template1','prompt_template2']", + "['metrics1', 'metrics2']", + True, + "[add_recipe]: The 'grading_scale' argument must be a non-empty string and not None.", + False, + ), + # Exception case + ( + "New Recipe ID", + "This is a test recipe.", + "['tag1']", + "['category1']", + "['dataset1','dataset2']", + "['prompt_template1','prompt_template2']", + "['metrics1', 'metrics2']", + "{'A': [80,100], 'B': [60,79]}", + "[add_recipe]: An error has occurred while creating recipe.", + True, + ), + ], + ) + @patch("moonshot.integrations.cli.benchmark.recipe.api_create_recipe") + def test_add_recipe( + self, + mock_api_create_recipe, + name, + description, + tags, + categories, + datasets, + prompt_templates, + metrics, + grading_scale, + expected_output, + expected_call, + capsys, + ): + if "error" in expected_output: + mock_api_create_recipe.side_effect = Exception( + "An error has occurred while creating recipe." 
+ ) + else: + mock_api_create_recipe.return_value = "new_recipe_id" + + class Args: + pass + + args = Args() + args.name = name + args.description = description + args.tags = tags + args.categories = categories + args.datasets = datasets + args.prompt_templates = prompt_templates + args.metrics = metrics + args.grading_scale = grading_scale + + add_recipe(args) + + captured = capsys.readouterr() + assert expected_output == captured.out.strip() + + if expected_call: + mock_api_create_recipe.assert_called_once_with( + name, + description, + eval(tags), + eval(categories), + eval(datasets), + eval(prompt_templates), + eval(metrics), + eval(grading_scale), + ) + else: + mock_api_create_recipe.assert_not_called() + + # ------------------------------------------------------------------------------ + # Test list_recipes functionality with non-mocked filter-data + # ------------------------------------------------------------------------------ + @pytest.mark.parametrize( + "find, pagination, api_response, expected_output, expected_log, to_be_called", + [ + # Valid cases + ( + None, + None, + api_response, + api_response, + "", + True, + ), + # No recipes + (None, None, [], None, "There are no recipes found.", False), + ( + "realtime", + None, + api_response, + api_response, + "", + True, + ), + ( + None, + "(1, 1)", + api_response, + api_response_pagination, + "", + True, + ), + ("Recipe", "(1, 1)", [], None, "There are no recipes found.", False), + # Invalid cases for find + ( + "", + None, + api_response, + None, + "[list_recipes]: The 'find' argument must be a non-empty string and not None.", + False, + ), + ( + 99, + None, + api_response, + None, + "[list_recipes]: The 'find' argument must be a non-empty string and not None.", + False, + ), + ( + {}, + None, + api_response, + None, + "[list_recipes]: The 'find' argument must be a non-empty string and not None.", + False, + ), + ( + [], + None, + api_response, + None, + "[list_recipes]: The 'find' argument must be a non-empty string and not None.", + False, + ), + ( + (), + None, + api_response, + None, + "[list_recipes]: The 'find' argument must be a non-empty string and not None.", + False, + ), + ( + True, + None, + api_response, + None, + "[list_recipes]: The 'find' argument must be a non-empty string and not None.", + False, + ), + # Invalid cases for pagination + ( + None, + "", + api_response, + None, + "[list_recipes]: The 'pagination' argument must be a non-empty string and not None.", + False, + ), + ( + None, + 99, + api_response, + None, + "[list_recipes]: The 'pagination' argument must be a non-empty string and not None.", + False, + ), + ( + None, + {}, + api_response, + None, + "[list_recipes]: The 'pagination' argument must be a non-empty string and not None.", + False, + ), + ( + None, + [], + api_response, + None, + "[list_recipes]: The 'pagination' argument must be a non-empty string and not None.", + False, + ), + ( + None, + (), + api_response, + None, + "[list_recipes]: The 'pagination' argument must be a non-empty string and not None.", + False, + ), + ( + None, + True, + api_response, + None, + "[list_recipes]: The 'pagination' argument must be a non-empty string and not None.", + False, + ), + ( + None, + True, + api_response, + None, + "[list_recipes]: The 'pagination' argument must be a non-empty string and not None.", + False, + ), + ( + None, + "(1, 'a')", + api_response, + None, + "[list_recipes]: The 'pagination' argument must be a tuple of two integers.", + False, + ), + ( + None, + "(1, 2, 3)", + api_response, + 
None, + "[list_recipes]: The 'pagination' argument must be a tuple of two integers.", + False, + ), + ( + None, + "(1, )", + api_response, + None, + "[list_recipes]: The 'pagination' argument must be a tuple of two integers.", + False, + ), + ( + None, + "(0, 1)", + api_response, + None, + "[list_recipes]: Invalid page number or page size. Page number and page size should start from 1.", + False, + ), + ( + None, + "(1, 0)", + api_response, + None, + "[list_recipes]: Invalid page number or page size. Page number and page size should start from 1.", + False, + ), + ( + None, + "(0, 0)", + api_response, + None, + "[list_recipes]: Invalid page number or page size. Page number and page size should start from 1.", + False, + ), + ( + None, + "(1, -1)", + api_response, + None, + "[list_recipes]: Invalid page number or page size. Page number and page size should start from 1.", + False, + ), + ( + None, + "(-1, 1)", + api_response, + None, + "[list_recipes]: Invalid page number or page size. Page number and page size should start from 1.", + False, + ), + ( + None, + "(-1, -1)", + api_response, + None, + "[list_recipes]: Invalid page number or page size. Page number and page size should start from 1.", + False, + ), + # Exception case + ( + None, + None, + api_response, + None, + "[list_recipes]: An error has occurred while listing recipes.", + False, + ), + ], + ) + @patch("moonshot.integrations.cli.benchmark.recipe.api_get_all_recipe") + @patch("moonshot.integrations.cli.benchmark.recipe._display_recipes") + def test_list_recipes( + self, + mock_display_recipes, + mock_api_get_all_recipe, + find, + pagination, + api_response, + expected_output, + expected_log, + to_be_called, + capsys, + ): + if "error" in expected_log: + mock_api_get_all_recipe.side_effect = Exception( + "An error has occurred while listing recipes." 
+ ) + else: + mock_api_get_all_recipe.return_value = api_response + + class Args: + pass + + args = Args() + args.find = find + args.pagination = pagination + + result = list_recipes(args) + + captured = capsys.readouterr() + assert expected_log == captured.out.strip() + assert result == expected_output + + if to_be_called: + mock_display_recipes.assert_called_once_with(api_response) + else: + mock_display_recipes.assert_not_called() + + # ------------------------------------------------------------------------------ + # Test list_recipes functionality with mocked filter-data + # ------------------------------------------------------------------------------ + @pytest.mark.parametrize( + "find, pagination, api_response, filtered_response, expected_output, expected_log, to_be_called", + [ + ( + None, + None, + api_response, + api_response_pagination, + api_response_pagination, + "", + True, + ), + ( + "Recipe", + None, + api_response, + api_response_pagination, + api_response_pagination, + "", + True, + ), + ( + None, + "(0, 1)", + api_response, + api_response_pagination, + api_response_pagination, + "", + True, + ), + # Case where filtered_response is None + ( + None, + None, + api_response, + None, + None, + "There are no recipes found.", + False, + ), + # Case where filtered_response is an empty list + ( + None, + None, + api_response, + [], + None, + "There are no recipes found.", + False, + ), + ], + ) + @patch("moonshot.integrations.cli.benchmark.recipe.api_get_all_recipe") + @patch("moonshot.integrations.cli.benchmark.recipe._display_recipes") + @patch("moonshot.integrations.cli.benchmark.recipe.filter_data") + def test_list_recipes_filtered( + self, + mock_filter_data, + mock_display_recipes, + mock_api_get_all_recipe, + find, + pagination, + api_response, + filtered_response, + expected_output, + expected_log, + to_be_called, + capsys, + ): + mock_api_get_all_recipe.return_value = api_response + mock_filter_data.return_value = filtered_response + + class Args: + pass + + args = Args() + args.find = find + args.pagination = pagination + + result = list_recipes(args) + + captured = capsys.readouterr() + assert expected_log == captured.out.strip() + assert result == expected_output + + if to_be_called: + mock_display_recipes.assert_called_once_with(filtered_response) + else: + mock_display_recipes.assert_not_called() + + # ------------------------------------------------------------------------------ + # Test view_recipe functionality + # ------------------------------------------------------------------------------ + @pytest.mark.parametrize( + "recipe_id, api_response, expected_log, to_be_called", + [ + # Valid case + ( + "1", + api_response, + "", + True, + ), + # Invalid case: recipe_id is None + ( + None, + api_response, + "[view_recipe]: The 'recipe' argument must be a non-empty string and not None.", + False, + ), + # Invalid case: recipe_id is not a string + ( + "", + api_response, + "[view_recipe]: The 'recipe' argument must be a non-empty string and not None.", + False, + ), + ( + 123, + api_response, + "[view_recipe]: The 'recipe' argument must be a non-empty string and not None.", + False, + ), + ( + {}, + api_response, + "[view_recipe]: The 'recipe' argument must be a non-empty string and not None.", + False, + ), + ( + [], + api_response, + "[view_recipe]: The 'recipe' argument must be a non-empty string and not None.", + False, + ), + ( + (), + api_response, + "[view_recipe]: The 'recipe' argument must be a non-empty string and not None.", + False, + ), + ( + True, + 
api_response, + "[view_recipe]: The 'recipe' argument must be a non-empty string and not None.", + False, + ), + # Exception case: api_read_recipe raises an exception + ( + "1", + api_response, + "[view_recipe]: An error has occurred while reading the recipe.", + False, + ), + ], + ) + @patch("moonshot.integrations.cli.benchmark.recipe.api_read_recipe") + @patch("moonshot.integrations.cli.benchmark.recipe._display_recipes") + def test_view_recipe( + self, + mock_display_recipes, + mock_api_read_recipe, + recipe_id, + api_response, + expected_log, + to_be_called, + capsys, + ): + if "error" in expected_log: + mock_api_read_recipe.side_effect = Exception( + "An error has occurred while reading the recipe." + ) + else: + mock_api_read_recipe.return_value = api_response + + class Args: + pass + + args = Args() + args.recipe = recipe_id + + view_recipe(args) + + captured = capsys.readouterr() + assert expected_log == captured.out.strip() + + if to_be_called: + mock_display_recipes.assert_called_once_with([api_response]) + else: + mock_display_recipes.assert_not_called() + + # ------------------------------------------------------------------------------ + # Test run_recipe functionality + # ------------------------------------------------------------------------------ + @pytest.mark.parametrize( + "name, recipes, endpoints, num_of_prompts, random_seed, system_prompt, \ + runner_proc_module, result_proc_module, expected_log", + [ + # Valid case + ( + "Test Runner", + "['recipe1', 'recipe2']", + "['endpoint1', 'endpoint2']", + 10, + 42, + "Test system prompt", + "runner_module", + "result_module", + "", + ), + # Invalid case: name + ( + "", + "['recipe1', 'recipe2']", + "['endpoint1', 'endpoint2']", + 10, + 42, + "Test system prompt", + "runner_module", + "result_module", + "[run_recipe]: The 'name' argument must be a non-empty string and not None.", + ), + ( + None, + "['recipe1', 'recipe2']", + "['endpoint1', 'endpoint2']", + 10, + 42, + "Test system prompt", + "runner_module", + "result_module", + "[run_recipe]: The 'name' argument must be a non-empty string and not None.", + ), + ( + 123, + "['recipe1', 'recipe2']", + "['endpoint1', 'endpoint2']", + 10, + 42, + "Test system prompt", + "runner_module", + "result_module", + "[run_recipe]: The 'name' argument must be a non-empty string and not None.", + ), + ( + {}, + "['recipe1', 'recipe2']", + "['endpoint1', 'endpoint2']", + 10, + 42, + "Test system prompt", + "runner_module", + "result_module", + "[run_recipe]: The 'name' argument must be a non-empty string and not None.", + ), + ( + [], + "['recipe1', 'recipe2']", + "['endpoint1', 'endpoint2']", + 10, + 42, + "Test system prompt", + "runner_module", + "result_module", + "[run_recipe]: The 'name' argument must be a non-empty string and not None.", + ), + ( + (), + "['recipe1', 'recipe2']", + "['endpoint1', 'endpoint2']", + 10, + 42, + "Test system prompt", + "runner_module", + "result_module", + "[run_recipe]: The 'name' argument must be a non-empty string and not None.", + ), + ( + True, + "['recipe1', 'recipe2']", + "['endpoint1', 'endpoint2']", + 10, + 42, + "Test system prompt", + "runner_module", + "result_module", + "[run_recipe]: The 'name' argument must be a non-empty string and not None.", + ), + # Invalid case: recipes is not a list of string + ( + "Test Runner", + "[123, 123]", + "['endpoint1', 'endpoint2']", + 10, + 42, + "Test system prompt", + "runner_module", + "result_module", + "[run_recipe]: The 'recipes' argument must evaluate to a list of strings.", + ), + # Invalid case: 
recipes is not a string + ( + "Test Runner", + None, + "['endpoint1', 'endpoint2']", + 10, + 42, + "Test system prompt", + "runner_module", + "result_module", + "[run_recipe]: The 'recipes' argument must be a non-empty string and not None.", + ), + ( + "Test Runner", + "", + "['endpoint1', 'endpoint2']", + 10, + 42, + "Test system prompt", + "runner_module", + "result_module", + "[run_recipe]: The 'recipes' argument must be a non-empty string and not None.", + ), + ( + "Test Runner", + 123, + "['endpoint1', 'endpoint2']", + 10, + 42, + "Test system prompt", + "runner_module", + "result_module", + "[run_recipe]: The 'recipes' argument must be a non-empty string and not None.", + ), + ( + "Test Runner", + {}, + "['endpoint1', 'endpoint2']", + 10, + 42, + "Test system prompt", + "runner_module", + "result_module", + "[run_recipe]: The 'recipes' argument must be a non-empty string and not None.", + ), + ( + "Test Runner", + [], + "['endpoint1', 'endpoint2']", + 10, + 42, + "Test system prompt", + "runner_module", + "result_module", + "[run_recipe]: The 'recipes' argument must be a non-empty string and not None.", + ), + ( + "Test Runner", + (), + "['endpoint1', 'endpoint2']", + 10, + 42, + "Test system prompt", + "runner_module", + "result_module", + "[run_recipe]: The 'recipes' argument must be a non-empty string and not None.", + ), + ( + "Test Runner", + True, + "['endpoint1', 'endpoint2']", + 10, + 42, + "Test system prompt", + "runner_module", + "result_module", + "[run_recipe]: The 'recipes' argument must be a non-empty string and not None.", + ), + # Invalid case: endpoints is not a list of string + ( + "Test Runner", + "['recipe1', 'recipe2']", + "[123, 123]", + 10, + 42, + "Test system prompt", + "runner_module", + "result_module", + "[run_recipe]: The 'endpoints' argument must evaluate to a list of strings.", + ), + # Invalid case: endpoints is not a string + ( + "Test Runner", + "['recipe1', 'recipe2']", + None, + 10, + 42, + "Test system prompt", + "runner_module", + "result_module", + "[run_recipe]: The 'endpoints' argument must be a non-empty string and not None.", + ), + ( + "Test Runner", + "['recipe1', 'recipe2']", + "", + 10, + 42, + "Test system prompt", + "runner_module", + "result_module", + "[run_recipe]: The 'endpoints' argument must be a non-empty string and not None.", + ), + ( + "Test Runner", + "['recipe1', 'recipe2']", + 123, + 10, + 42, + "Test system prompt", + "runner_module", + "result_module", + "[run_recipe]: The 'endpoints' argument must be a non-empty string and not None.", + ), + ( + "Test Runner", + "['recipe1', 'recipe2']", + {}, + 10, + 42, + "Test system prompt", + "runner_module", + "result_module", + "[run_recipe]: The 'endpoints' argument must be a non-empty string and not None.", + ), + ( + "Test Runner", + "['recipe1', 'recipe2']", + [], + 10, + 42, + "Test system prompt", + "runner_module", + "result_module", + "[run_recipe]: The 'endpoints' argument must be a non-empty string and not None.", + ), + ( + "Test Runner", + "['recipe1', 'recipe2']", + (), + 10, + 42, + "Test system prompt", + "runner_module", + "result_module", + "[run_recipe]: The 'endpoints' argument must be a non-empty string and not None.", + ), + ( + "Test Runner", + "['recipe1', 'recipe2']", + True, + 10, + 42, + "Test system prompt", + "runner_module", + "result_module", + "[run_recipe]: The 'endpoints' argument must be a non-empty string and not None.", + ), + # Invalid case: num_of_prompts is not an integer + ( + "Test Runner", + "['recipe1', 'recipe2']", + "['endpoint1', 
'endpoint2']", + None, + 42, + "Test system prompt", + "runner_module", + "result_module", + "[run_recipe]: The 'num_of_prompts' argument must be an integer.", + ), + ( + "Test Runner", + "['recipe1', 'recipe2']", + "['endpoint1', 'endpoint2']", + "", + 42, + "Test system prompt", + "runner_module", + "result_module", + "[run_recipe]: The 'num_of_prompts' argument must be an integer.", + ), + ( + "Test Runner", + "['recipe1', 'recipe2']", + "['endpoint1', 'endpoint2']", + {}, + 42, + "Test system prompt", + "runner_module", + "result_module", + "[run_recipe]: The 'num_of_prompts' argument must be an integer.", + ), + ( + "Test Runner", + "['recipe1', 'recipe2']", + "['endpoint1', 'endpoint2']", + [], + 42, + "Test system prompt", + "runner_module", + "result_module", + "[run_recipe]: The 'num_of_prompts' argument must be an integer.", + ), + ( + "Test Runner", + "['recipe1', 'recipe2']", + "['endpoint1', 'endpoint2']", + (), + 42, + "Test system prompt", + "runner_module", + "result_module", + "[run_recipe]: The 'num_of_prompts' argument must be an integer.", + ), + ( + "Test Runner", + "['recipe1', 'recipe2']", + "['endpoint1', 'endpoint2']", + True, + 42, + "Test system prompt", + "runner_module", + "result_module", + "[run_recipe]: The 'num_of_prompts' argument must be an integer.", + ), + # Invalid case: random_seed is not an integer + ( + "Test Runner", + "['recipe1', 'recipe2']", + "['endpoint1', 'endpoint2']", + 10, + None, + "Test system prompt", + "runner_module", + "result_module", + "[run_recipe]: The 'random_seed' argument must be an integer.", + ), + ( + "Test Runner", + "['recipe1', 'recipe2']", + "['endpoint1', 'endpoint2']", + 10, + "", + "Test system prompt", + "runner_module", + "result_module", + "[run_recipe]: The 'random_seed' argument must be an integer.", + ), + ( + "Test Runner", + "['recipe1', 'recipe2']", + "['endpoint1', 'endpoint2']", + 10, + {}, + "Test system prompt", + "runner_module", + "result_module", + "[run_recipe]: The 'random_seed' argument must be an integer.", + ), + ( + "Test Runner", + "['recipe1', 'recipe2']", + "['endpoint1', 'endpoint2']", + 10, + [], + "Test system prompt", + "runner_module", + "result_module", + "[run_recipe]: The 'random_seed' argument must be an integer.", + ), + ( + "Test Runner", + "['recipe1', 'recipe2']", + "['endpoint1', 'endpoint2']", + 10, + (), + "Test system prompt", + "runner_module", + "result_module", + "[run_recipe]: The 'random_seed' argument must be an integer.", + ), + ( + "Test Runner", + "['recipe1', 'recipe2']", + "['endpoint1', 'endpoint2']", + 10, + True, + "Test system prompt", + "runner_module", + "result_module", + "[run_recipe]: The 'random_seed' argument must be an integer.", + ), + # Invalid case: system_prompt is None + ( + "Test Runner", + "['recipe1', 'recipe2']", + "['endpoint1', 'endpoint2']", + 10, + 42, + None, + "runner_module", + "result_module", + "[run_recipe]: The 'system_prompt' argument must be a non-empty string and not None.", + ), + ( + "Test Runner", + "['recipe1', 'recipe2']", + "['endpoint1', 'endpoint2']", + 10, + 42, + "", + "runner_module", + "result_module", + "[run_recipe]: The 'system_prompt' argument must be a non-empty string and not None.", + ), + ( + "Test Runner", + "['recipe1', 'recipe2']", + "['endpoint1', 'endpoint2']", + 10, + 42, + {}, + "runner_module", + "result_module", + "[run_recipe]: The 'system_prompt' argument must be a non-empty string and not None.", + ), + ( + "Test Runner", + "['recipe1', 'recipe2']", + "['endpoint1', 'endpoint2']", + 10, + 42, + [], 
+ "runner_module", + "result_module", + "[run_recipe]: The 'system_prompt' argument must be a non-empty string and not None.", + ), + ( + "Test Runner", + "['recipe1', 'recipe2']", + "['endpoint1', 'endpoint2']", + 10, + 42, + (), + "runner_module", + "result_module", + "[run_recipe]: The 'system_prompt' argument must be a non-empty string and not None.", + ), + ( + "Test Runner", + "['recipe1', 'recipe2']", + "['endpoint1', 'endpoint2']", + 10, + 42, + True, + "runner_module", + "result_module", + "[run_recipe]: The 'system_prompt' argument must be a non-empty string and not None.", + ), + # Invalid case: runner_proc_module is None + ( + "Test Runner", + "['recipe1', 'recipe2']", + "['endpoint1', 'endpoint2']", + 10, + 42, + "Test system prompt", + None, + "result_module", + "[run_recipe]: The 'runner_proc_module' argument must be a non-empty string and not None.", + ), + ( + "Test Runner", + "['recipe1', 'recipe2']", + "['endpoint1', 'endpoint2']", + 10, + 42, + "Test system prompt", + "", + "result_module", + "[run_recipe]: The 'runner_proc_module' argument must be a non-empty string and not None.", + ), + ( + "Test Runner", + "['recipe1', 'recipe2']", + "['endpoint1', 'endpoint2']", + 10, + 42, + "Test system prompt", + {}, + "result_module", + "[run_recipe]: The 'runner_proc_module' argument must be a non-empty string and not None.", + ), + ( + "Test Runner", + "['recipe1', 'recipe2']", + "['endpoint1', 'endpoint2']", + 10, + 42, + "Test system prompt", + [], + "result_module", + "[run_recipe]: The 'runner_proc_module' argument must be a non-empty string and not None.", + ), + ( + "Test Runner", + "['recipe1', 'recipe2']", + "['endpoint1', 'endpoint2']", + 10, + 42, + "Test system prompt", + (), + "result_module", + "[run_recipe]: The 'runner_proc_module' argument must be a non-empty string and not None.", + ), + ( + "Test Runner", + "['recipe1', 'recipe2']", + "['endpoint1', 'endpoint2']", + 10, + 42, + "Test system prompt", + True, + "result_module", + "[run_recipe]: The 'runner_proc_module' argument must be a non-empty string and not None.", + ), + # Invalid case: result_proc_module is None + ( + "Test Runner", + "['recipe1', 'recipe2']", + "['endpoint1', 'endpoint2']", + 10, + 42, + "Test system prompt", + "runner_module", + None, + "[run_recipe]: The 'result_proc_module' argument must be a non-empty string and not None.", + ), + ( + "Test Runner", + "['recipe1', 'recipe2']", + "['endpoint1', 'endpoint2']", + 10, + 42, + "Test system prompt", + "runner_module", + "", + "[run_recipe]: The 'result_proc_module' argument must be a non-empty string and not None.", + ), + ( + "Test Runner", + "['recipe1', 'recipe2']", + "['endpoint1', 'endpoint2']", + 10, + 42, + "Test system prompt", + "runner_module", + {}, + "[run_recipe]: The 'result_proc_module' argument must be a non-empty string and not None.", + ), + ( + "Test Runner", + "['recipe1', 'recipe2']", + "['endpoint1', 'endpoint2']", + 10, + 42, + "Test system prompt", + "runner_module", + [], + "[run_recipe]: The 'result_proc_module' argument must be a non-empty string and not None.", + ), + ( + "Test Runner", + "['recipe1', 'recipe2']", + "['endpoint1', 'endpoint2']", + 10, + 42, + "Test system prompt", + "runner_module", + (), + "[run_recipe]: The 'result_proc_module' argument must be a non-empty string and not None.", + ), + ( + "Test Runner", + "['recipe1', 'recipe2']", + "['endpoint1', 'endpoint2']", + 10, + 42, + "Test system prompt", + "runner_module", + True, + "[run_recipe]: The 'result_proc_module' argument must be a 
non-empty string and not None.", + ), + # Exception case: api_create_runner raises an exception + ( + "Test Runner", + "['recipe1', 'recipe2']", + "['endpoint1', 'endpoint2']", + 10, + 42, + "Test system prompt", + "runner_module", + "result_module", + "[run_recipe]: An error has occurred while creating the runner.", + ), + # Exception case: api_load_runner raises an exception + ( + "Test Runner", + "['recipe1', 'recipe2']", + "['endpoint1', 'endpoint2']", + 10, + 42, + "Test system prompt", + "runner_module", + "result_module", + "[run_recipe]: An error has occurred while loading the runner.", + ), + # Exception case: api_get_all_runner_name raises an exception + ( + "Test Runner", + "['recipe1', 'recipe2']", + "['endpoint1', 'endpoint2']", + 10, + 42, + "Test system prompt", + "runner_module", + "result_module", + "[run_recipe]: An error has occurred while getting all runner names.", + ), + # Exception case: api_get_all_run raises an exception + ( + "Test Runner", + "['recipe1', 'recipe2']", + "['endpoint1', 'endpoint2']", + 10, + 42, + "Test system prompt", + "runner_module", + "result_module", + "[run_recipe]: An error has occurred while getting all runs.", + ), + # Exception case: no results raises an exception + ( + "Test Runner", + "['recipe1', 'recipe2']", + "['endpoint1', 'endpoint2']", + 10, + 42, + "Test system prompt", + "runner_module", + "result_module", + "[run_recipe]: There are no results generated.", + ), + # Exception case: show_recipe_results raises an exception + ( + "Test Runner", + "['recipe1', 'recipe2']", + "['endpoint1', 'endpoint2']", + 10, + 42, + "Test system prompt", + "runner_module", + "result_module", + "[run_recipe]: An error has occurred while showing recipe results.", + ), + ], + ) + @patch("moonshot.integrations.cli.benchmark.recipe.api_get_all_runner_name") + @patch("moonshot.integrations.cli.benchmark.recipe.api_load_runner") + @patch("moonshot.integrations.cli.benchmark.recipe.api_create_runner") + @patch("moonshot.integrations.cli.benchmark.recipe.api_get_all_run") + @patch("moonshot.integrations.cli.benchmark.recipe._show_recipe_results") + def test_run_recipe( + self, + mock_show_recipe_results, + mock_api_get_all_run, + mock_api_create_runner, + mock_api_load_runner, + mock_api_get_all_runner_name, + name, + recipes, + endpoints, + num_of_prompts, + random_seed, + system_prompt, + runner_proc_module, + result_proc_module, + expected_log, + capsys, + ): + to_trigger_called = False + + if "getting all runner names" in expected_log: + mock_api_get_all_runner_name.side_effect = Exception( + "An error has occurred while getting all runner names." + ) + + elif "creating the runner" in expected_log: + mock_api_get_all_runner_name.return_value = [] + mock_api_create_runner.side_effect = Exception( + "An error has occurred while creating the runner." + ) + + elif "loading the runner" in expected_log: + mock_api_get_all_runner_name.return_value = ["test-runner"] + mock_api_load_runner.side_effect = Exception( + "An error has occurred while loading the runner." + ) + + elif "getting all runs" in expected_log: + mock_api_get_all_runner_name.return_value = [] + mock_api_create_runner.return_value = AsyncMock() + mock_api_get_all_run.side_effect = Exception( + "An error has occurred while getting all runs." 
+ ) + + elif "showing recipe results" in expected_log: + to_trigger_called = True + mock_api_get_all_runner_name.return_value = [] + mock_api_create_runner.return_value = AsyncMock() + mock_api_get_all_run.return_value = [ + {"results": {"metadata": {"duration": 10}}} + ] + mock_show_recipe_results.side_effect = Exception( + "An error has occurred while showing recipe results." + ) + + elif "no results" in expected_log: + mock_api_get_all_runner_name.return_value = [] + mock_api_create_runner.return_value = AsyncMock() + mock_api_get_all_run.return_value = [ + {"someresults": {"metadata": {"duration": 10}}} + ] + + else: + mock_api_create_runner.return_value = AsyncMock() + mock_api_load_runner.return_value = AsyncMock() + mock_api_get_all_runner_name.return_value = [] + mock_api_get_all_run.return_value = [ + {"results": {"metadata": {"duration": 10}}} + ] + + class Args: + pass + + args = Args() + args.name = name + args.recipes = recipes + args.endpoints = endpoints + args.num_of_prompts = num_of_prompts + args.random_seed = random_seed + args.system_prompt = system_prompt + args.runner_proc_module = runner_proc_module + args.result_proc_module = result_proc_module + + run_recipe(args) + + captured = capsys.readouterr() + assert expected_log == captured.out.strip() + + if not expected_log or to_trigger_called: + mock_show_recipe_results.assert_called_once() + else: + mock_show_recipe_results.assert_not_called() + + # ------------------------------------------------------------------------------ + # Test update_recipe functionality + # ------------------------------------------------------------------------------ + @pytest.mark.parametrize( + "recipe, update_values, expected_log, to_be_called", + [ + # Valid case + ( + "Recipe 1", + "[('name', 'Updated Recipe'), ('description', 'Updated description')]", + "[update_recipe]: Recipe updated.", + True, + ), + # Invalid case - recipe + ( + "", + "[('name', 'Updated Recipe'), ('description', 'Updated description')]", + "[update_recipe]: The 'recipe' argument must be a non-empty string and not None.", + False, + ), + ( + None, + "[('name', 'Updated Recipe'), ('description', 'Updated description')]", + "[update_recipe]: The 'recipe' argument must be a non-empty string and not None.", + False, + ), + ( + 123, + "[('name', 'Updated Recipe'), ('description', 'Updated description')]", + "[update_recipe]: The 'recipe' argument must be a non-empty string and not None.", + False, + ), + ( + {}, + "[('name', 'Updated Recipe'), ('description', 'Updated description')]", + "[update_recipe]: The 'recipe' argument must be a non-empty string and not None.", + False, + ), + ( + [], + "[('name', 'Updated Recipe'), ('description', 'Updated description')]", + "[update_recipe]: The 'recipe' argument must be a non-empty string and not None.", + False, + ), + ( + (), + "[('name', 'Updated Recipe'), ('description', 'Updated description')]", + "[update_recipe]: The 'recipe' argument must be a non-empty string and not None.", + False, + ), + ( + True, + "[('name', 'Updated Recipe'), ('description', 'Updated description')]", + "[update_recipe]: The 'recipe' argument must be a non-empty string and not None.", + False, + ), + # Invalid case - update values + ( + "Recipe 1", + "", + "[update_recipe]: The 'update_values' argument must be a non-empty string and not None.", + False, + ), + ( + "Recipe 1", + "['', '']", + "[update_recipe]: The 'update_values' argument must evaluate to a list of tuples.", + False, + ), + ( + "Recipe 1", + "[[], ()]", + "[update_recipe]: The 
'update_values' argument must evaluate to a list of tuples.", + False, + ), + ( + "Recipe 1", + None, + "[update_recipe]: The 'update_values' argument must be a non-empty string and not None.", + False, + ), + ( + "Recipe 1", + 123, + "[update_recipe]: The 'update_values' argument must be a non-empty string and not None.", + False, + ), + ( + "Recipe 1", + {}, + "[update_recipe]: The 'update_values' argument must be a non-empty string and not None.", + False, + ), + ( + "Recipe 1", + [], + "[update_recipe]: The 'update_values' argument must be a non-empty string and not None.", + False, + ), + ( + "Recipe 1", + (), + "[update_recipe]: The 'update_values' argument must be a non-empty string and not None.", + False, + ), + ( + "Recipe 1", + True, + "[update_recipe]: The 'update_values' argument must be a non-empty string and not None.", + False, + ), + # Test case: API update raises an exception + ( + "Recipe 1", + "[('name', 'Updated Recipe'), ('description', 'Updated description')]", + "[update_recipe]: An error has occurred while updating the recipe.", + True, + ), + ], + ) + @patch("moonshot.integrations.cli.benchmark.recipe.api_update_recipe") + def test_update_recipe( + self, + mock_api_update_recipe, + capsys, + recipe, + update_values, + expected_log, + to_be_called, + ): + if "error" in expected_log: + mock_api_update_recipe.side_effect = Exception( + "An error has occurred while updating the recipe." + ) + else: + mock_api_update_recipe.return_value = "updated" + + class Args: + pass + + args = Args() + args.recipe = recipe + args.update_values = update_values + + update_recipe(args) + + captured = capsys.readouterr() + assert expected_log == captured.out.strip() + + if to_be_called: + mock_api_update_recipe.assert_called_once_with( + args.recipe, **dict(literal_eval(args.update_values)) + ) + else: + mock_api_update_recipe.assert_not_called() + + # ------------------------------------------------------------------------------ + # Test delete_recipe functionality + # ------------------------------------------------------------------------------ + @pytest.mark.parametrize( + "recipe, expected_log, to_be_called", + [ + # Valid case + ("Recipe 1", "[delete_recipe]: Recipe deleted.", True), + # Invalid case - recipe + ( + "", + "[delete_recipe]: The 'recipe' argument must be a non-empty string and not None.", + False, + ), + ( + None, + "[delete_recipe]: The 'recipe' argument must be a non-empty string and not None.", + False, + ), + ( + 123, + "[delete_recipe]: The 'recipe' argument must be a non-empty string and not None.", + False, + ), + ( + {}, + "[delete_recipe]: The 'recipe' argument must be a non-empty string and not None.", + False, + ), + ( + [], + "[delete_recipe]: The 'recipe' argument must be a non-empty string and not None.", + False, + ), + ( + (), + "[delete_recipe]: The 'recipe' argument must be a non-empty string and not None.", + False, + ), + ( + True, + "[delete_recipe]: The 'recipe' argument must be a non-empty string and not None.", + False, + ), + ], + ) + @patch("moonshot.integrations.cli.benchmark.recipe.api_delete_recipe") + def test_delete_recipe( + self, mock_api_delete_recipe, capsys, recipe, expected_log, to_be_called + ): + class Args: + pass + + args = Args() + args.recipe = recipe + + with patch( + "moonshot.integrations.cli.benchmark.recipe.console.input", + return_value="y", + ): + with patch("moonshot.integrations.cli.benchmark.recipe.console.print"): + delete_recipe(args) + + captured = capsys.readouterr() + assert expected_log == 
captured.out.strip() + + if to_be_called: + mock_api_delete_recipe.assert_called_once_with(args.recipe) + else: + mock_api_delete_recipe.assert_not_called() + + @patch("moonshot.integrations.cli.benchmark.recipe.console.input", return_value="y") + @patch("moonshot.integrations.cli.benchmark.recipe.api_delete_recipe") + def test_delete_recipe_confirm_yes(self, mock_delete, mock_input): + args = MagicMock() + args.recipe = "test_recipe_id" + + delete_recipe(args) + + mock_input.assert_called_once_with( + "[bold red]Are you sure you want to delete the recipe (y/N)? [/]" + ) + mock_delete.assert_called_once_with("test_recipe_id") + + @patch("moonshot.integrations.cli.benchmark.recipe.console.input", return_value="n") + @patch("moonshot.integrations.cli.benchmark.recipe.api_delete_recipe") + def test_delete_recipe_confirm_no(self, mock_delete, mock_input): + args = MagicMock() + args.recipe = "test_recipe_id" + + delete_recipe(args) + + mock_input.assert_called_once_with( + "[bold red]Are you sure you want to delete the recipe (y/N)? [/]" + ) + mock_delete.assert_not_called() + + @patch("moonshot.integrations.cli.benchmark.recipe.console.input", return_value="n") + @patch("moonshot.integrations.cli.benchmark.recipe.console.print") + @patch("moonshot.integrations.cli.benchmark.recipe.api_delete_recipe") + def test_delete_recipe_cancelled_output(self, mock_delete, mock_print, mock_input): + args = MagicMock() + args.recipe = "test_recipe_id" + + delete_recipe(args) + + mock_input.assert_called_once_with( + "[bold red]Are you sure you want to delete the recipe (y/N)? [/]" + ) + mock_print.assert_called_once_with("[bold yellow]Recipe deletion cancelled.[/]") + mock_delete.assert_not_called() diff --git a/tests/unit-tests/cli/test_result.py b/tests/unit-tests/cli/test_result.py new file mode 100644 index 00000000..f14b622d --- /dev/null +++ b/tests/unit-tests/cli/test_result.py @@ -0,0 +1,917 @@ +from unittest.mock import MagicMock, patch + +import pytest + +from moonshot.integrations.cli.benchmark.result import ( + delete_result, + list_results, + view_result, +) + + +class TestCollectionCliResult: + api_response = [ + { + "metadata": { + "id": "my-new-recipe-runner", + "start_time": "2024-08-17 13:52:16", + "end_time": "2024-08-17 13:52:48", + "duration": 32, + "status": "completed", + "recipes": ["bbq", "mmlu"], + "cookbooks": None, + "endpoints": ["openai-gpt35-turbo"], + "num_of_prompts": 1, + "random_seed": 1, + "system_prompt": "You are an intelligent AI", + }, + "results": { + "recipes": [ + { + "id": "bbq", + "details": [ + { + "model_id": "openai-gpt35-turbo", + "dataset_id": "bbq-lite-disability-status-ambiguous", + "prompt_template_id": "mcq-template", + "data": [ + { + "prompt": '"Some prompts.', + "predicted_result": "A. Can't be determined", + "target": "A. 
Can't be determined", + "duration": 8.763961791002657, + } + ], + "metrics": [ + { + "accuracy": 100.0, + "grading_criteria": {"accuracy": 100.0}, + } + ], + } + ], + } + ], + "evaluation_summary": [ + { + "model_id": "openai-gpt35-turbo", + "num_of_prompts": 1, + "avg_grade_value": 0.0, + "grade": "E", + } + ], + "grading_scale": { + "A": [80, 100], + "B": [60, 79], + "C": [40, 59], + "D": [20, 39], + "E": [0, 19], + }, + "total_num_of_prompts": 1, + }, + } + ] + api_response_pagination = [ + { + "metadata": { + "id": "my-new-recipe-runner", + "start_time": "2024-08-17 13:52:16", + "end_time": "2024-08-17 13:52:48", + "duration": 32, + "status": "completed", + "recipes": ["bbq", "mmlu"], + "cookbooks": None, + "endpoints": ["openai-gpt35-turbo"], + "num_of_prompts": 1, + "random_seed": 1, + "system_prompt": "You are an intelligent AI", + }, + "results": { + "recipes": [ + { + "id": "bbq", + "details": [ + { + "model_id": "openai-gpt35-turbo", + "dataset_id": "bbq-lite-disability-status-ambiguous", + "prompt_template_id": "mcq-template", + "data": [ + { + "prompt": '"Some prompts.', + "predicted_result": "A. Can't be determined", + "target": "A. Can't be determined", + "duration": 8.763961791002657, + } + ], + "metrics": [ + { + "accuracy": 100.0, + "grading_criteria": {"accuracy": 100.0}, + } + ], + } + ], + } + ], + "evaluation_summary": [ + { + "model_id": "openai-gpt35-turbo", + "num_of_prompts": 1, + "avg_grade_value": 0.0, + "grade": "E", + } + ], + "grading_scale": { + "A": [80, 100], + "B": [60, 79], + "C": [40, 59], + "D": [20, 39], + "E": [0, 19], + }, + "total_num_of_prompts": 1, + }, + "idx": 1, + } + ] + api_read_response_recipe = { + "metadata": { + "id": "my-new-recipe-runner", + "start_time": "2024-08-17 13:52:16", + "end_time": "2024-08-17 13:52:48", + "duration": 32, + "status": "completed", + "recipes": ["mmlu"], + "cookbooks": None, + "endpoints": ["openai-gpt35-turbo"], + "num_of_prompts": 1, + "random_seed": 1, + "system_prompt": "You are an intelligent AI", + }, + "results": { + "recipes": [ + { + "id": "bbq", + "details": [ + { + "model_id": "openai-gpt35-turbo", + "dataset_id": "bbq-lite-disability-status-ambiguous", + "prompt_template_id": "mcq-template", + "data": [ + { + "prompt": '"Some prompts.', + "predicted_result": "A. Can't be determined", + "target": "A. Can't be determined", + "duration": 8.763961791002657, + } + ], + "metrics": [ + { + "accuracy": 100.0, + "grading_criteria": {"accuracy": 100.0}, + } + ], + } + ], + } + ], + "evaluation_summary": [ + { + "model_id": "openai-gpt35-turbo", + "num_of_prompts": 1, + "avg_grade_value": 0.0, + "grade": "E", + } + ], + "grading_scale": { + "A": [80, 100], + "B": [60, 79], + "C": [40, 59], + "D": [20, 39], + "E": [0, 19], + }, + "total_num_of_prompts": 1, + }, + } + api_read_response_cookbook = { + "metadata": { + "id": "my-new-recipe-runner", + "start_time": "2024-08-17 13:52:16", + "end_time": "2024-08-17 13:52:48", + "duration": 32, + "status": "completed", + "recipes": None, + "cookbooks": ["bbq_cookbook"], + "endpoints": ["openai-gpt35-turbo"], + "num_of_prompts": 1, + "random_seed": 1, + "system_prompt": "You are an intelligent AI", + }, + "results": { + "recipes": [ + { + "id": "bbq", + "details": [ + { + "model_id": "openai-gpt35-turbo", + "dataset_id": "bbq-lite-disability-status-ambiguous", + "prompt_template_id": "mcq-template", + "data": [ + { + "prompt": '"Some prompts.', + "predicted_result": "A. Can't be determined", + "target": "A. 
Can't be determined", + "duration": 8.763961791002657, + } + ], + "metrics": [ + { + "accuracy": 100.0, + "grading_criteria": {"accuracy": 100.0}, + } + ], + } + ], + } + ], + "evaluation_summary": [ + { + "model_id": "openai-gpt35-turbo", + "num_of_prompts": 1, + "avg_grade_value": 0.0, + "grade": "E", + } + ], + "grading_scale": { + "A": [80, 100], + "B": [60, 79], + "C": [40, 59], + "D": [20, 39], + "E": [0, 19], + }, + "total_num_of_prompts": 1, + }, + } + + @pytest.fixture(autouse=True) + def init(self): + # Perform tests + yield + + # ------------------------------------------------------------------------------ + # Test list_results functionality with non-mocked filter-data + # ------------------------------------------------------------------------------ + @pytest.mark.parametrize( + "find, pagination, api_response, expected_output, expected_log, to_be_called", + [ + # Valid cases + ( + None, + None, + api_response, + api_response, + "", + True, + ), + # No results + ( + None, + None, + [], + None, + "There are no results found.", + False, + ), + ( + "my-new-recipe-runner", + None, + api_response, + api_response, + "", + True, + ), + ( + None, + "(1, 1)", + api_response, + api_response_pagination, + "", + True, + ), + ( + "Results", + "(1, 1)", + api_response, + None, + "There are no results found.", + False, + ), + # Invalid cases for find + ( + "", + None, + api_response, + None, + "[list_results]: The 'find' argument must be a non-empty string and not None.", + False, + ), + ( + 99, + None, + api_response, + None, + "[list_results]: The 'find' argument must be a non-empty string and not None.", + False, + ), + ( + {}, + None, + api_response, + None, + "[list_results]: The 'find' argument must be a non-empty string and not None.", + False, + ), + ( + [], + None, + api_response, + None, + "[list_results]: The 'find' argument must be a non-empty string and not None.", + False, + ), + ( + (), + None, + api_response, + None, + "[list_results]: The 'find' argument must be a non-empty string and not None.", + False, + ), + ( + True, + None, + api_response, + None, + "[list_results]: The 'find' argument must be a non-empty string and not None.", + False, + ), + # Invalid cases for pagination + ( + None, + "", + api_response, + None, + "[list_results]: The 'pagination' argument must be a non-empty string and not None.", + False, + ), + ( + None, + 99, + api_response, + None, + "[list_results]: The 'pagination' argument must be a non-empty string and not None.", + False, + ), + ( + None, + {}, + api_response, + None, + "[list_results]: The 'pagination' argument must be a non-empty string and not None.", + False, + ), + ( + None, + [], + api_response, + None, + "[list_results]: The 'pagination' argument must be a non-empty string and not None.", + False, + ), + ( + None, + (), + api_response, + None, + "[list_results]: The 'pagination' argument must be a non-empty string and not None.", + False, + ), + ( + None, + True, + api_response, + None, + "[list_results]: The 'pagination' argument must be a non-empty string and not None.", + False, + ), + ( + None, + True, + api_response, + None, + "[list_results]: The 'pagination' argument must be a non-empty string and not None.", + False, + ), + ( + None, + "(1, 'a')", + api_response, + None, + "[list_results]: The 'pagination' argument must be a tuple of two integers.", + False, + ), + ( + None, + "(1, 2, 3)", + api_response, + None, + "[list_results]: The 'pagination' argument must be a tuple of two integers.", + False, + ), + ( + None, + "(1, 
)", + api_response, + None, + "[list_results]: The 'pagination' argument must be a tuple of two integers.", + False, + ), + ( + None, + "(0, 1)", + api_response, + None, + "[list_results]: Invalid page number or page size. Page number and page size should start from 1.", + False, + ), + ( + None, + "(1, 0)", + api_response, + None, + "[list_results]: Invalid page number or page size. Page number and page size should start from 1.", + False, + ), + ( + None, + "(0, 0)", + api_response, + None, + "[list_results]: Invalid page number or page size. Page number and page size should start from 1.", + False, + ), + ( + None, + "(1, -1)", + api_response, + None, + "[list_results]: Invalid page number or page size. Page number and page size should start from 1.", + False, + ), + ( + None, + "(-1, 1)", + api_response, + None, + "[list_results]: Invalid page number or page size. Page number and page size should start from 1.", + False, + ), + ( + None, + "(-1, -1)", + api_response, + None, + "[list_results]: Invalid page number or page size. Page number and page size should start from 1.", + False, + ), + # Exception case + ( + None, + None, + api_response, + None, + "[list_results]: An error has occurred while listing results.", + False, + ), + ], + ) + @patch("moonshot.integrations.cli.benchmark.result.api_get_all_result") + @patch("moonshot.integrations.cli.benchmark.result._display_results") + def test_list_results( + self, + mock_display_results, + mock_api_get_all_results, + find, + pagination, + api_response, + expected_output, + expected_log, + to_be_called, + capsys, + ): + if "error" in expected_log: + mock_api_get_all_results.side_effect = Exception( + "An error has occurred while listing results." + ) + else: + mock_api_get_all_results.return_value = api_response + + class Args: + pass + + args = Args() + args.find = find + args.pagination = pagination + + result = list_results(args) + + captured = capsys.readouterr() + assert expected_log == captured.out.strip() + assert result == expected_output + + if to_be_called: + mock_display_results.assert_called_once_with(api_response) + else: + mock_display_results.assert_not_called() + + # ------------------------------------------------------------------------------ + # Test list_results functionality with mocked filter-data + # ------------------------------------------------------------------------------ + @pytest.mark.parametrize( + "find, pagination, api_response, filtered_response, expected_output, expected_log, to_be_called", + [ + ( + None, + None, + api_response, + api_response_pagination, + api_response_pagination, + "", + True, + ), + ( + "squad", + None, + api_response, + api_response_pagination, + api_response_pagination, + "", + True, + ), + ( + None, + "(0, 1)", + api_response, + api_response_pagination, + api_response_pagination, + "", + True, + ), + # Case where filtered_response is None + ( + None, + None, + api_response, + None, + None, + "There are no results found.", + False, + ), + # Case where filtered_response is an empty list + ( + None, + None, + api_response, + [], + None, + "There are no results found.", + False, + ), + ], + ) + @patch("moonshot.integrations.cli.benchmark.result.api_get_all_result") + @patch("moonshot.integrations.cli.benchmark.result._display_results") + @patch("moonshot.integrations.cli.benchmark.result.filter_data") + def test_list_results_filtered( + self, + mock_filter_data, + mock_display_results, + mock_api_get_all_results, + find, + pagination, + api_response, + filtered_response, + 
expected_output, + expected_log, + to_be_called, + capsys, + ): + mock_api_get_all_results.return_value = api_response + mock_filter_data.return_value = filtered_response + + class Args: + pass + + args = Args() + args.find = find + args.pagination = pagination + + result = list_results(args) + + captured = capsys.readouterr() + assert expected_log == captured.out.strip() + assert result == expected_output + + if to_be_called: + mock_display_results.assert_called_once_with(filtered_response) + else: + mock_display_results.assert_not_called() + + # ------------------------------------------------------------------------------ + # Test view_result functionality + # ------------------------------------------------------------------------------ + @pytest.mark.parametrize( + "result_filename, api_response, expected_log, expected_cookbook_call, expected_recipe_call", + [ + # Test for Cookbook Result + ( + "cookbook_result", + api_read_response_cookbook, + None, + True, + False, + ), + # Test for Recipe Result + ( + "recipe_result", + api_read_response_recipe, + None, + False, + True, + ), + # Invalid case: result_filename is None + ( + None, + api_read_response_recipe, + "[view_result]: The 'result_filename' argument must be a non-empty string and not None.", + False, + False, + ), + # Invalid case: result_filename is not a string + ( + "", + api_read_response_recipe, + "[view_result]: The 'result_filename' argument must be a non-empty string and not None.", + False, + False, + ), + ( + 123, + api_read_response_recipe, + "[view_result]: The 'result_filename' argument must be a non-empty string and not None.", + False, + False, + ), + ( + {}, + api_read_response_recipe, + "[view_result]: The 'result_filename' argument must be a non-empty string and not None.", + False, + False, + ), + ( + [], + api_read_response_recipe, + "[view_result]: The 'result_filename' argument must be a non-empty string and not None.", + False, + False, + ), + ( + (), + api_read_response_recipe, + "[view_result]: The 'result_filename' argument must be a non-empty string and not None.", + False, + False, + ), + ( + True, + api_read_response_recipe, + "[view_result]: The 'result_filename' argument must be a non-empty string and not None.", + False, + False, + ), + # Invalid case: read result is not correct + ( + "recipe_result", + "", + "[view_result]: The 'metadata' argument not found.", + False, + False, + ), + ( + "recipe_result", + 123, + "[view_result]: The 'metadata' argument not found.", + False, + False, + ), + ( + "recipe_result", + {}, + "[view_result]: The 'metadata' argument not found.", + False, + False, + ), + ( + "recipe_result", + [], + "[view_result]: The 'metadata' argument not found.", + False, + False, + ), + ( + "recipe_result", + (), + "[view_result]: The 'metadata' argument not found.", + False, + False, + ), + ( + "recipe_result", + True, + "[view_result]: The 'metadata' argument not found.", + False, + False, + ), + # No recipe or cookbooks + ( + "no_metadata_result", + {"metadata": {}}, + "[view_result]: Unable to determine cookbook or recipe", + False, + False, + ), + # Exception case: api_read_result raises an exception + ( + "exception_result", + Exception("Test Exception"), + "[view_result]: Test Exception", + False, + False, + ), + ], + ) + @patch("moonshot.integrations.cli.benchmark.result.api_read_result") + @patch("moonshot.integrations.cli.benchmark.result._display_view_recipe_result") + @patch("moonshot.integrations.cli.benchmark.result._display_view_cookbook_result") + def 
test_view_result( + self, + mock_display_cookbook, + mock_display_recipe, + mock_api_read, + result_filename, + api_response, + expected_log, + expected_cookbook_call, + expected_recipe_call, + capsys, + ): + if isinstance(api_response, Exception): + mock_api_read.side_effect = api_response + else: + mock_api_read.return_value = api_response + + class Args: + pass + + args = Args() + args.result_filename = result_filename + + view_result(args) + + captured = capsys.readouterr() + if expected_log: + assert expected_log in captured.out.strip() + else: + assert captured.out.strip() == "" + + if expected_cookbook_call: + mock_display_cookbook.assert_called_once() + else: + mock_display_cookbook.assert_not_called() + + if expected_recipe_call: + mock_display_recipe.assert_called_once() + else: + mock_display_recipe.assert_not_called() + + # ------------------------------------------------------------------------------ + # Test delete_result functionality + # ------------------------------------------------------------------------------ + @pytest.mark.parametrize( + "result, expected_log, to_be_called", + [ + # Valid case + ("Result 1", "[delete_result]: Result deleted.", True), + # Invalid case - result + ( + "", + "[delete_result]: The 'result' argument must be a non-empty string and not None.", + False, + ), + ( + None, + "[delete_result]: The 'result' argument must be a non-empty string and not None.", + False, + ), + ( + 123, + "[delete_result]: The 'result' argument must be a non-empty string and not None.", + False, + ), + ( + {}, + "[delete_result]: The 'result' argument must be a non-empty string and not None.", + False, + ), + ( + [], + "[delete_result]: The 'result' argument must be a non-empty string and not None.", + False, + ), + ( + (), + "[delete_result]: The 'result' argument must be a non-empty string and not None.", + False, + ), + ( + True, + "[delete_result]: The 'result' argument must be a non-empty string and not None.", + False, + ), + ], + ) + @patch("moonshot.integrations.cli.benchmark.result.api_delete_result") + def test_delete_result( + self, mock_api_delete_result, capsys, result, expected_log, to_be_called + ): + class Args: + pass + + args = Args() + args.result = result + + with patch( + "moonshot.integrations.cli.benchmark.result.console.input", + return_value="y", + ): + with patch("moonshot.integrations.cli.benchmark.result.console.print"): + delete_result(args) + + captured = capsys.readouterr() + assert expected_log == captured.out.strip() + + if to_be_called: + mock_api_delete_result.assert_called_once_with(args.result) + else: + mock_api_delete_result.assert_not_called() + + @patch("moonshot.integrations.cli.benchmark.result.console.input", return_value="y") + @patch("moonshot.integrations.cli.benchmark.result.api_delete_result") + def test_delete_result_confirm_yes(self, mock_delete, mock_input): + args = MagicMock() + args.result = "test_result_id" + + delete_result(args) + + mock_input.assert_called_once_with( + "[bold red]Are you sure you want to delete the result (y/N)? [/]" + ) + mock_delete.assert_called_once_with("test_result_id") + + @patch("moonshot.integrations.cli.benchmark.result.console.input", return_value="n") + @patch("moonshot.integrations.cli.benchmark.result.api_delete_result") + def test_delete_result_confirm_no(self, mock_delete, mock_input): + args = MagicMock() + args.result = "test_result_id" + + delete_result(args) + + mock_input.assert_called_once_with( + "[bold red]Are you sure you want to delete the result (y/N)? 
[/]" + ) + mock_delete.assert_not_called() + + @patch("moonshot.integrations.cli.benchmark.result.console.input", return_value="n") + @patch("moonshot.integrations.cli.benchmark.result.console.print") + @patch("moonshot.integrations.cli.benchmark.result.api_delete_result") + def test_delete_result_cancelled_output(self, mock_delete, mock_print, mock_input): + args = MagicMock() + args.result = "test_result_id" + + delete_result(args) + + mock_input.assert_called_once_with( + "[bold red]Are you sure you want to delete the result (y/N)? [/]" + ) + mock_print.assert_called_once_with("[bold yellow]Result deletion cancelled.[/]") + mock_delete.assert_not_called() diff --git a/tests/unit-tests/cli/test_run.py b/tests/unit-tests/cli/test_run.py new file mode 100644 index 00000000..840e5ae5 --- /dev/null +++ b/tests/unit-tests/cli/test_run.py @@ -0,0 +1,531 @@ +from unittest.mock import patch + +import pytest + +from moonshot.integrations.cli.benchmark.run import list_runs, view_run +from moonshot.src.runners.runner_type import RunnerType + + +class TestCollectionCliRun: + api_response = [ + { + "run_id": 1, + "runner_id": "my-new-recipe-runner", + "runner_type": RunnerType.BENCHMARK, + "runner_args": { + "recipes": ["bbq"], + "num_of_prompts": 1, + "random_seed": 1, + "system_prompt": "You are an intelligent AI", + "runner_processing_module": "benchmarking", + "run_processing_module": "benchmarking-run", + }, + "endpoints": ["openai-gpt35-turbo"], + "runs_file": "/generated-outputs/runs/my-new-recipe-runner.json", + "start_time": 1723873936.436674, + "end_time": 1723873968.6472352, + "duration": 32, + "error_messages": [], + "raw_runs": {"bbq": "some run"}, + } + ] + + api_response_pagination = [ + { + "run_id": 1, + "runner_id": "my-new-recipe-runner", + "runner_type": RunnerType.BENCHMARK, + "runner_args": { + "recipes": ["bbq"], + "num_of_prompts": 1, + "random_seed": 1, + "system_prompt": "You are an intelligent AI", + "runner_processing_module": "benchmarking", + "run_processing_module": "benchmarking-run", + }, + "endpoints": ["openai-gpt35-turbo"], + "runs_file": "/generated-outputs/runs/my-new-recipe-runner.json", + "start_time": 1723873936.436674, + "end_time": 1723873968.6472352, + "duration": 32, + "error_messages": [], + "raw_runs": {"bbq": "some run"}, + "idx": 1, + } + ] + + @pytest.fixture(autouse=True) + def init(self): + # Perform tests + yield + + # ------------------------------------------------------------------------------ + # Test list_runs functionality with non-mocked filter-data + # ------------------------------------------------------------------------------ + @pytest.mark.parametrize( + "find, pagination, api_response, expected_output, expected_log, to_be_called", + [ + # Valid cases + ( + None, + None, + api_response, + api_response, + "", + True, + ), + # No runs + ( + None, + None, + [], + None, + "There are no runs found.", + False, + ), + ( + "my-new-recipe-runner", + None, + api_response, + api_response, + "", + True, + ), + ( + None, + "(1, 1)", + api_response, + api_response_pagination, + "", + True, + ), + ( + "Squad", + "(1, 1)", + api_response, + None, + "There are no runs found.", + False, + ), + # Invalid cases for find + ( + "", + None, + api_response, + None, + "[list_runs]: The 'find' argument must be a non-empty string and not None.", + False, + ), + ( + 99, + None, + api_response, + None, + "[list_runs]: The 'find' argument must be a non-empty string and not None.", + False, + ), + ( + {}, + None, + api_response, + None, + "[list_runs]: The 'find' 
argument must be a non-empty string and not None.", + False, + ), + ( + [], + None, + api_response, + None, + "[list_runs]: The 'find' argument must be a non-empty string and not None.", + False, + ), + ( + (), + None, + api_response, + None, + "[list_runs]: The 'find' argument must be a non-empty string and not None.", + False, + ), + ( + True, + None, + api_response, + None, + "[list_runs]: The 'find' argument must be a non-empty string and not None.", + False, + ), + # Invalid cases for pagination + ( + None, + "", + api_response, + None, + "[list_runs]: The 'pagination' argument must be a non-empty string and not None.", + False, + ), + ( + None, + 99, + api_response, + None, + "[list_runs]: The 'pagination' argument must be a non-empty string and not None.", + False, + ), + ( + None, + {}, + api_response, + None, + "[list_runs]: The 'pagination' argument must be a non-empty string and not None.", + False, + ), + ( + None, + [], + api_response, + None, + "[list_runs]: The 'pagination' argument must be a non-empty string and not None.", + False, + ), + ( + None, + (), + api_response, + None, + "[list_runs]: The 'pagination' argument must be a non-empty string and not None.", + False, + ), + ( + None, + True, + api_response, + None, + "[list_runs]: The 'pagination' argument must be a non-empty string and not None.", + False, + ), + ( + None, + True, + api_response, + None, + "[list_runs]: The 'pagination' argument must be a non-empty string and not None.", + False, + ), + ( + None, + "(1, 'a')", + api_response, + None, + "[list_runs]: The 'pagination' argument must be a tuple of two integers.", + False, + ), + ( + None, + "(1, 2, 3)", + api_response, + None, + "[list_runs]: The 'pagination' argument must be a tuple of two integers.", + False, + ), + ( + None, + "(1, )", + api_response, + None, + "[list_runs]: The 'pagination' argument must be a tuple of two integers.", + False, + ), + ( + None, + "(0, 1)", + api_response, + None, + "[list_runs]: Invalid page number or page size. Page number and page size should start from 1.", + False, + ), + ( + None, + "(1, 0)", + api_response, + None, + "[list_runs]: Invalid page number or page size. Page number and page size should start from 1.", + False, + ), + ( + None, + "(0, 0)", + api_response, + None, + "[list_runs]: Invalid page number or page size. Page number and page size should start from 1.", + False, + ), + ( + None, + "(1, -1)", + api_response, + None, + "[list_runs]: Invalid page number or page size. Page number and page size should start from 1.", + False, + ), + ( + None, + "(-1, 1)", + api_response, + None, + "[list_runs]: Invalid page number or page size. Page number and page size should start from 1.", + False, + ), + ( + None, + "(-1, -1)", + api_response, + None, + "[list_runs]: Invalid page number or page size. Page number and page size should start from 1.", + False, + ), + # Exception case + ( + None, + None, + api_response, + None, + "[list_runs]: An error has occurred while listing runs.", + False, + ), + ], + ) + @patch("moonshot.integrations.cli.benchmark.run.api_get_all_run") + @patch("moonshot.integrations.cli.benchmark.run._display_runs") + def test_list_runs( + self, + mock_display_runs, + mock_api_get_all_runs, + find, + pagination, + api_response, + expected_output, + expected_log, + to_be_called, + capsys, + ): + if "error" in expected_log: + mock_api_get_all_runs.side_effect = Exception( + "An error has occurred while listing runs." 
+ ) + else: + mock_api_get_all_runs.return_value = api_response + + class Args: + pass + + args = Args() + args.find = find + args.pagination = pagination + + run = list_runs(args) + + captured = capsys.readouterr() + assert expected_log == captured.out.strip() + assert run == expected_output + + if to_be_called: + mock_display_runs.assert_called_once_with(api_response) + else: + mock_display_runs.assert_not_called() + + # ------------------------------------------------------------------------------ + # Test list_runs functionality with mocked filter-data + # ------------------------------------------------------------------------------ + @pytest.mark.parametrize( + "find, pagination, api_response, filtered_response, expected_output, expected_log, to_be_called", + [ + ( + None, + None, + api_response, + api_response_pagination, + api_response_pagination, + "", + True, + ), + ( + "my-new-recipe", + None, + api_response, + api_response_pagination, + api_response_pagination, + "", + True, + ), + ( + None, + "(0, 1)", + api_response, + api_response_pagination, + api_response_pagination, + "", + True, + ), + # Case where filtered_response is None + ( + None, + None, + api_response, + None, + None, + "There are no runs found.", + False, + ), + # Case where filtered_response is an empty list + ( + None, + None, + api_response, + [], + None, + "There are no runs found.", + False, + ), + ], + ) + @patch("moonshot.integrations.cli.benchmark.run.api_get_all_run") + @patch("moonshot.integrations.cli.benchmark.run._display_runs") + @patch("moonshot.integrations.cli.benchmark.run.filter_data") + def test_list_runs_filtered( + self, + mock_filter_data, + mock_display_runs, + mock_api_get_all_runs, + find, + pagination, + api_response, + filtered_response, + expected_output, + expected_log, + to_be_called, + capsys, + ): + mock_api_get_all_runs.return_value = api_response + mock_filter_data.return_value = filtered_response + + class Args: + pass + + args = Args() + args.find = find + args.pagination = pagination + + run = list_runs(args) + + captured = capsys.readouterr() + assert expected_log == captured.out.strip() + assert run == expected_output + + if to_be_called: + mock_display_runs.assert_called_once_with(filtered_response) + else: + mock_display_runs.assert_not_called() + + # ------------------------------------------------------------------------------ + # Test view_run functionality + # ------------------------------------------------------------------------------ + @pytest.mark.parametrize( + "runner_id, api_response, expected_log, expected_call", + [ + # Test for Cookbook Run + ( + "runner_id", + api_response, + None, + True, + ), + # Invalid case: runner_id is None + ( + None, + api_response, + "[view_run]: The 'runner_id' argument must be a non-empty string and not None.", + False, + ), + # Invalid case: runner_id is not a string + ( + "", + api_response, + "[view_run]: The 'runner_id' argument must be a non-empty string and not None.", + False, + ), + ( + 123, + api_response, + "[view_run]: The 'runner_id' argument must be a non-empty string and not None.", + False, + ), + ( + {}, + api_response, + "[view_run]: The 'runner_id' argument must be a non-empty string and not None.", + False, + ), + ( + [], + api_response, + "[view_run]: The 'runner_id' argument must be a non-empty string and not None.", + False, + ), + ( + (), + api_response, + "[view_run]: The 'runner_id' argument must be a non-empty string and not None.", + False, + ), + ( + True, + api_response, + "[view_run]: The 
'runner_id' argument must be a non-empty string and not None.", + False, + ), + # Exception case: api_get_all_run raises an exception + ( + "exception_run", + Exception("Test Exception"), + "[view_run]: Test Exception", + False, + ), + ], + ) + @patch("moonshot.integrations.cli.benchmark.run.api_get_all_run") + @patch("moonshot.integrations.cli.benchmark.run._display_runs") + def test_view_run( + self, + mock_display_runs, + mock_api_get_all_run, + runner_id, + api_response, + expected_log, + expected_call, + capsys, + ): + if isinstance(api_response, Exception): + mock_api_get_all_run.side_effect = api_response + else: + mock_api_get_all_run.return_value = api_response + + class Args: + pass + + args = Args() + args.runner_id = runner_id + + view_run(args) + + captured = capsys.readouterr() + if expected_log: + assert expected_log in captured.out.strip() + else: + assert captured.out.strip() == "" + + if expected_call: + mock_display_runs.assert_called_once() + else: + mock_display_runs.assert_not_called() diff --git a/tests/unit-tests/cli/test_runner.py b/tests/unit-tests/cli/test_runner.py new file mode 100644 index 00000000..f96a6295 --- /dev/null +++ b/tests/unit-tests/cli/test_runner.py @@ -0,0 +1,414 @@ +from unittest.mock import MagicMock, patch + +import pytest + +from moonshot.integrations.cli.benchmark.runner import ( + delete_runner, + list_runners, + view_runner, +) + + +class TestCollectionCliRunner: + api_response_read_runner = [ + { + "id": "my-new-recipe-runner", + "name": "my new recipe runner", + "database_file": "generated-outputs/databases/my-new-recipe-runner.db", + "endpoints": ["openai-gpt35-turbo"], + "description": "", + } + ] + api_response_runs = [ + { + "metadata": { + "id": "my-new-recipe-runner", + "start_time": "2024-08-17 13:52:16", + "end_time": "2024-08-17 13:52:48", + "duration": 32, + "status": "completed", + "recipes": ["bbq", "mmlu"], + "cookbooks": None, + "endpoints": ["openai-gpt35-turbo"], + "num_of_prompts": 1, + "random_seed": 1, + "system_prompt": "You are an intelligent AI", + }, + "results": { + "recipes": [ + { + "id": "bbq", + "details": [ + { + "model_id": "openai-gpt35-turbo", + "dataset_id": "bbq-lite-disability-status-ambiguous", + "prompt_template_id": "mcq-template", + "data": [ + { + "prompt": '"Some prompts.', + "predicted_result": "A. Can't be determined", + "target": "A. 
Can't be determined", + "duration": 8.763961791002657, + } + ], + "metrics": [ + { + "accuracy": 100.0, + "grading_criteria": {"accuracy": 100.0}, + } + ], + } + ], + } + ], + "evaluation_summary": [ + { + "model_id": "openai-gpt35-turbo", + "num_of_prompts": 1, + "avg_grade_value": 0.0, + "grade": "E", + } + ], + "grading_scale": { + "A": [80, 100], + "B": [60, 79], + "C": [40, 59], + "D": [20, 39], + "E": [0, 19], + }, + "total_num_of_prompts": 1, + }, + } + ] + api_response_session = None + + @pytest.fixture(autouse=True) + def init(self): + # Perform tests + yield + + # ------------------------------------------------------------------------------ + # Test list_runner functionality # TODO + # ------------------------------------------------------------------------------ + + # ------------------------------------------------------------------------------ + # Test view_runner functionality + # ------------------------------------------------------------------------------ + @pytest.mark.parametrize( + "runner_id, api_response_read_runner, api_response_runs, api_response_session, expected_log, to_be_called, read_runner_called, get_all_run_called, load_session_called", + [ + # Valid case + ( + "1", + api_response_read_runner, + api_response_runs, + api_response_session, + "", + True, + True, + True, + True, + ), + # Invalid case: runner_id is None + ( + None, + api_response_read_runner, + api_response_runs, + api_response_session, + "[view_runner]: The 'runner' argument must be a non-empty string and not None.", + False, + False, + False, + False, + ), + # Invalid case: runner_id is not a string + ( + "", + api_response_read_runner, + api_response_runs, + api_response_session, + "[view_runner]: The 'runner' argument must be a non-empty string and not None.", + False, + False, + False, + False, + ), + ( + 123, + api_response_read_runner, + api_response_runs, + api_response_session, + "[view_runner]: The 'runner' argument must be a non-empty string and not None.", + False, + False, + False, + False, + ), + ( + {}, + api_response_read_runner, + api_response_runs, + api_response_session, + "[view_runner]: The 'runner' argument must be a non-empty string and not None.", + False, + False, + False, + False, + ), + ( + [], + api_response_read_runner, + api_response_runs, + api_response_session, + "[view_runner]: The 'runner' argument must be a non-empty string and not None.", + False, + False, + False, + False, + ), + ( + (), + api_response_read_runner, + api_response_runs, + api_response_session, + "[view_runner]: The 'runner' argument must be a non-empty string and not None.", + False, + False, + False, + False, + ), + ( + True, + api_response_read_runner, + api_response_runs, + api_response_session, + "[view_runner]: The 'runner' argument must be a non-empty string and not None.", + False, + False, + False, + False, + ), + # Exception case: api_read_runner raises an exception + ( + "1", + api_response_read_runner, + api_response_runs, + api_response_session, + "[view_runner]: An error has occurred while reading the runner.", + False, + True, + False, + False, + ), + # Exception case: api_get_all_run raises an exception + ( + "1", + api_response_read_runner, + api_response_runs, + api_response_session, + "[view_runner]: An error has occurred while reading the runs.", + False, + True, + True, + False, + ), + # Exception case: api_load_session raises an exception + ( + "1", + api_response_read_runner, + api_response_runs, + api_response_session, + "[view_runner]: An error has occurred while reading 
the sessions.", + False, + True, + True, + True, + ), + ], + ) + @patch("moonshot.integrations.cli.benchmark.runner.api_read_runner") + @patch("moonshot.integrations.cli.benchmark.runner.api_get_all_run") + @patch("moonshot.integrations.cli.benchmark.runner.api_load_session") + @patch("moonshot.integrations.cli.benchmark.runner._display_runners") + def test_view_runner( + self, + mock_display_runners, + mock_api_load_session, + mock_api_get_all_run, + mock_api_read_runner, + runner_id, + api_response_read_runner, + api_response_runs, + api_response_session, + expected_log, + to_be_called, + read_runner_called, + get_all_run_called, + load_session_called, + capsys, + ): + if "error has occurred while reading the runner." in expected_log: + mock_api_read_runner.side_effect = Exception( + "An error has occurred while reading the runner." + ) + else: + mock_api_read_runner.return_value = api_response_read_runner + + if "error has occurred while reading the runs." in expected_log: + mock_api_get_all_run.side_effect = Exception( + "An error has occurred while reading the runs." + ) + else: + mock_api_get_all_run.return_value = api_response_runs + + if "error has occurred while reading the sessions." in expected_log: + mock_api_load_session.side_effect = Exception( + "An error has occurred while reading the sessions." + ) + else: + mock_api_load_session.return_value = api_response_session + + class Args: + pass + + args = Args() + args.runner = runner_id + + view_runner(args) + + captured = capsys.readouterr() + assert expected_log == captured.out.strip() + + if to_be_called: + mock_display_runners.assert_called_once_with( + [api_response_read_runner], api_response_runs, [api_response_session] + ) + else: + mock_display_runners.assert_not_called() + + if read_runner_called: + mock_api_read_runner.assert_called_once() + else: + mock_api_read_runner.assert_not_called() + + if get_all_run_called: + mock_api_get_all_run.assert_called_once() + else: + mock_api_get_all_run.assert_not_called() + + if load_session_called: + mock_api_load_session.assert_called_once() + else: + mock_api_load_session.assert_not_called() + + # ------------------------------------------------------------------------------ + # Test delete_runner functionality + # ------------------------------------------------------------------------------ + @pytest.mark.parametrize( + "runner, expected_log, to_be_called", + [ + # Valid case + ("Runner 1", "[delete_runner]: Runner deleted.", True), + # Invalid case - runner + ( + "", + "[delete_runner]: The 'runner' argument must be a non-empty string and not None.", + False, + ), + ( + None, + "[delete_runner]: The 'runner' argument must be a non-empty string and not None.", + False, + ), + ( + 123, + "[delete_runner]: The 'runner' argument must be a non-empty string and not None.", + False, + ), + ( + {}, + "[delete_runner]: The 'runner' argument must be a non-empty string and not None.", + False, + ), + ( + [], + "[delete_runner]: The 'runner' argument must be a non-empty string and not None.", + False, + ), + ( + (), + "[delete_runner]: The 'runner' argument must be a non-empty string and not None.", + False, + ), + ( + True, + "[delete_runner]: The 'runner' argument must be a non-empty string and not None.", + False, + ), + ], + ) + @patch("moonshot.integrations.cli.benchmark.runner.api_delete_runner") + def test_delete_runner( + self, mock_api_delete_runner, capsys, runner, expected_log, to_be_called + ): + class Args: + pass + + args = Args() + args.runner = runner + + with patch( + 
"moonshot.integrations.cli.benchmark.runner.console.input", + return_value="y", + ): + with patch("moonshot.integrations.cli.benchmark.runner.console.print"): + delete_runner(args) + + captured = capsys.readouterr() + assert expected_log == captured.out.strip() + + if to_be_called: + mock_api_delete_runner.assert_called_once_with(args.runner) + else: + mock_api_delete_runner.assert_not_called() + + @patch("moonshot.integrations.cli.benchmark.runner.console.input", return_value="y") + @patch("moonshot.integrations.cli.benchmark.runner.api_delete_runner") + def test_delete_runner_confirm_yes(self, mock_delete, mock_input): + args = MagicMock() + args.runner = "test_runner_id" + + delete_runner(args) + + mock_input.assert_called_once_with( + "[bold red]Are you sure you want to delete the runner (y/N)? [/]" + ) + mock_delete.assert_called_once_with("test_runner_id") + + @patch("moonshot.integrations.cli.benchmark.runner.console.input", return_value="n") + @patch("moonshot.integrations.cli.benchmark.runner.api_delete_runner") + def test_delete_runner_confirm_no(self, mock_delete, mock_input): + args = MagicMock() + args.runner = "test_runner_id" + + delete_runner(args) + + mock_input.assert_called_once_with( + "[bold red]Are you sure you want to delete the runner (y/N)? [/]" + ) + mock_delete.assert_not_called() + + @patch("moonshot.integrations.cli.benchmark.runner.console.input", return_value="n") + @patch("moonshot.integrations.cli.benchmark.runner.console.print") + @patch("moonshot.integrations.cli.benchmark.runner.api_delete_runner") + def test_delete_runner_cancelled_output(self, mock_delete, mock_print, mock_input): + args = MagicMock() + args.runner = "test_runner_id" + + delete_runner(args) + + mock_input.assert_called_once_with( + "[bold red]Are you sure you want to delete the runner (y/N)? [/]" + ) + mock_print.assert_called_once_with("[bold yellow]Runner deletion cancelled.[/]") + mock_delete.assert_not_called()