diff --git a/sdk/ai/azure-ai-generative/azure/ai/generative/evaluate/_constants.py b/sdk/ai/azure-ai-generative/azure/ai/generative/evaluate/_constants.py index 28b012efd134..e767ac2501cb 100644 --- a/sdk/ai/azure-ai-generative/azure/ai/generative/evaluate/_constants.py +++ b/sdk/ai/azure-ai-generative/azure/ai/generative/evaluate/_constants.py @@ -15,6 +15,17 @@ } +class ContentSafetyMetrics: + """ + Evaluation metrics supported by evaluation in azure-ai-generative sdk + """ + + HATE_FAIRNESS = "hate_fairness" + VIOLENCE = "violence" + SELF_HARM = "self_harm" + SEXUAL = "sexual" + + class EvaluationMetrics: """ Evaluation metrics supported by evaluation in azure-ai-generative sdk @@ -31,10 +42,6 @@ class EvaluationMetrics: RETRIEVAL_SCORE = "retrieval_score" GROUNDING_SCORE = "grounding_score" GENERATION_SCORE = "generation_score" - HATE_FAIRNESS = "hate_fairness" - VIOLENCE = "violence" - SELF_HARM = "self_harm" - SEXUAL = "sexual" class QaMetrics: @@ -50,10 +57,10 @@ class QaMetrics: EvaluationMetrics.GPT_FLUENCY, EvaluationMetrics.GPT_SIMILARITY, EvaluationMetrics.F1_SCORE, - EvaluationMetrics.HATE_FAIRNESS, - EvaluationMetrics.VIOLENCE, - EvaluationMetrics.SELF_HARM, - EvaluationMetrics.SEXUAL + ContentSafetyMetrics.HATE_FAIRNESS, + ContentSafetyMetrics.VIOLENCE, + ContentSafetyMetrics.SELF_HARM, + ContentSafetyMetrics.SEXUAL ] @@ -69,10 +76,10 @@ class ChatMetrics: EvaluationMetrics.GPT_GROUNDEDNESS, EvaluationMetrics.GPT_RELEVANCE, EvaluationMetrics.GPT_RETRIEVAL_SCORE, - EvaluationMetrics.HATE_FAIRNESS, - EvaluationMetrics.VIOLENCE, - EvaluationMetrics.SELF_HARM, - EvaluationMetrics.SEXUAL + ContentSafetyMetrics.HATE_FAIRNESS, + ContentSafetyMetrics.VIOLENCE, + ContentSafetyMetrics.SELF_HARM, + ContentSafetyMetrics.SEXUAL ] @@ -85,3 +92,10 @@ class ChatMetrics: QA: QaMetrics, CHAT: ChatMetrics } + +CONTENT_SAFETY_METRICS_LIST = [ + ContentSafetyMetrics.SEXUAL, + ContentSafetyMetrics.SELF_HARM, + ContentSafetyMetrics.VIOLENCE, + ContentSafetyMetrics.HATE_FAIRNESS +] diff --git a/sdk/ai/azure-ai-generative/azure/ai/generative/evaluate/_metric_handler.py b/sdk/ai/azure-ai-generative/azure/ai/generative/evaluate/_metric_handler.py index b79dd1152559..2f1cb62fb72f 100644 --- a/sdk/ai/azure-ai-generative/azure/ai/generative/evaluate/_metric_handler.py +++ b/sdk/ai/azure-ai-generative/azure/ai/generative/evaluate/_metric_handler.py @@ -8,7 +8,7 @@ import pandas as pd -from azure.ai.generative.evaluate._constants import TASK_TYPE_TO_METRICS_MAPPING, CHAT +from azure.ai.generative.evaluate._constants import TASK_TYPE_TO_METRICS_MAPPING, CHAT, CONTENT_SAFETY_METRICS_LIST from ._user_agent import USER_AGENT from ._utils import run_pf_flow_with_dict_list, df_to_dict_list, wait_for_pf_run_to_complete @@ -23,14 +23,14 @@ class MetricHandler(object): def __init__( - self, - task_type, - prediction_data: pd.DataFrame, - input_output_data: pd.DataFrame, - test_data, - metrics_mapping=None, - metrics=None, - data_mapping: Optional[Dict] = None, + self, + task_type, + prediction_data: pd.DataFrame, + input_output_data: pd.DataFrame, + test_data, + metrics_mapping=None, + metrics=None, + data_mapping: Optional[Dict] = None, ): self.task_type = task_type self.prediction_data = prediction_data @@ -72,7 +72,20 @@ def calculate_metrics(self) -> Dict: pf_client = PFClient(user_agent=USER_AGENT) - openai_config = self.metrics_mapping["openai_params"] + openai_config = self.metrics_mapping.get("openai_params") + + if openai_config is None: + if all(m in CONTENT_SAFETY_METRICS_LIST for m in metrics): + 
openai_config = { + "api_key": "api_key", + "api_base": "api_base", + "api_version": "api_version", + "api_type": "azure", + "deployment_id" : "deployment_id" + } + else: + raise Exception("model_config is required for metrics other than content safety metrics") + conn_name = "openai_connection" deployment_id = openai_config["deployment_id"] if not openai_config["api_type"] or openai_config["api_type"] == "azure": @@ -98,7 +111,8 @@ def calculate_metrics(self) -> Dict: nodes_list = NODE_LIST_BY_TASK[self.task_type] if self.task_type == CHAT: - pf_run = run_pf_flow_with_dict_list(flow_path, dict_list) + pf_run = run_pf_flow_with_dict_list(flow_path, dict_list, flow_params={ + "connections": {node: {"connection": conn_name}} for node in nodes_list}) else: pf_run = run_pf_flow_with_dict_list( flow_path, dict_list, flow_params={"connections": {node: connection_override for node in nodes_list}} diff --git a/sdk/ai/azure-ai-generative/tests/conftest.py b/sdk/ai/azure-ai-generative/tests/conftest.py index 670692fbf117..928511533ca0 100644 --- a/sdk/ai/azure-ai-generative/tests/conftest.py +++ b/sdk/ai/azure-ai-generative/tests/conftest.py @@ -137,6 +137,8 @@ def sanitized_environment_variables( "AI_SUBSCRIPTION_ID": "00000000-0000-0000-0000-000000000", "AI_RESOURCE_GROUP": "00000", "AI_WORKSPACE_NAME": "00000", + "AI_PROJECT_NAME": "00000", + "AI_TEAM_NAME": "00000", "AI_FEATURE_STORE_NAME": "00000", "AI_TEST_STORAGE_ACCOUNT_NAME": "teststorageaccount", "AI_TEST_STORAGE_ACCOUNT_PRIMARY_KEY": fake_datastore_key, diff --git a/sdk/ai/azure-ai-generative/tests/evaluate/e2etests/test_evaluate_e2e.py b/sdk/ai/azure-ai-generative/tests/evaluate/e2etests/test_evaluate_e2e.py index 18f679fd5f4d..472ee2040183 100644 --- a/sdk/ai/azure-ai-generative/tests/evaluate/e2etests/test_evaluate_e2e.py +++ b/sdk/ai/azure-ai-generative/tests/evaluate/e2etests/test_evaluate_e2e.py @@ -65,6 +65,75 @@ def test_evaluate_built_in_metrics(self, e2e_openai_api_base, e2e_openai_api_key assert tabular_result["gpt_groundedness"][0] in [1,2] assert tabular_result["gpt_groundedness"][1] in [5, 4] + def test_evaluate_built_in_metrics_missing_model_config(self, tmpdir): + test_data = [ + {"context": "Some are reported as not having been wanted at all.", + "question": "", + "answer": "All are reported as being completely and fully wanted." + }, + {"question": "How do you log a model?", + "context": "There are a few ways to log models in Azure Machine Learning. \n\nOne way is to use the `register_model()` method of the `Run` object. The `register_model()` method logs a model file in the Azure Machine Learning service workspace and makes it available for deployment. Here's an example:\n\n```python\nfrom azureml.core import Model\n\nmodel_path = '.\/outputs\/my_model.pkl'\nmodel = Model.register(workspace=ws, model_path=model_path, model_name='my_model')\n```\n\nThis code registers the model file located at `model_path` to the Azure Machine Learning service workspace with the name `my_model`. \n\nAnother way to log a model is to save it as an output of a `Run`. If your model generation code is part of a script or Jupyter notebook that runs as an Azure Machine Learning experiment, you can save the model file as an output of the `Run` object. 
Here's an example:\n\n```python\nfrom sklearn.linear_model import LogisticRegression\nfrom azureml.core.run import Run\n\n# Initialize a run object\nrun = Run.get_context()\n\n# Train your model\nX_train, y_train = ...\nclf = LogisticRegression().fit(X_train, y_train)\n\n# Save the model to the Run object's outputs directory\nmodel_path = 'outputs\/model.pkl'\njoblib.dump(value=clf, filename=model_path)\n\n# Log the model as a run artifact\nrun.upload_file(name=model_path, path_or_stream=model_path)\n```\n\nIn this code, `Run.get_context()` retrieves the current run context object, which you can use to track metadata and metrics for the run. After training your model, you can use `joblib.dump()` to save the model to a file, and then log the file as an artifact of the run using `run.upload_file()`.", + "answer": "There are a few ways to log models in Azure Machine Learning. \n\nOne way is to use the `register_model()` method of the `Run` object. The `register_model()` method logs a model file in the Azure Machine Learning service workspace and makes it available for deployment. Here's an example:\n\n```python\nfrom azureml.core import Model\n\nmodel_path = '.\/outputs\/my_model.pkl'\nmodel = Model.register(workspace=ws, model_path=model_path, model_name='my_model')\n```\n\nThis code registers the model file located at `model_path` to the Azure Machine Learning service workspace with the name `my_model`. \n\nAnother way to log a model is to save it as an output of a `Run`. If your model generation code is part of a script or Jupyter notebook that runs as an Azure Machine Learning experiment, you can save the model file as an output of the `Run` object. Here's an example:\n\n```python\nfrom sklearn.linear_model import LogisticRegression\nfrom azureml.core.run import Run\n\n# Initialize a run object\nrun = Run.get_context()\n\n# Train your model\nX_train, y_train = ...\nclf = LogisticRegression().fit(X_train, y_train)\n\n# Save the model to the Run object's outputs directory\nmodel_path = 'outputs\/model.pkl'\njoblib.dump(value=clf, filename=model_path)\n\n# Log the model as a run artifact\nrun.upload_file(name=model_path, path_or_stream=model_path)\n```\n\nIn this code, `Run.get_context()` retrieves the current run context object, which you can use to track metadata and metrics for the run. After training your model, you can use `joblib.dump()` to save the model to a file, and then log the file as an artifact of the run using `run.upload_file()`." + }, + ] + + with tmpdir.as_cwd(): + output_path = tmpdir + "/evaluation_output" + + with pytest.raises(Exception) as ex: + result = evaluate( # This will log metric/artifacts using mlflow + evaluation_name="rag-chat-1", + data=test_data, + task_type="qa", + metrics_list=["gpt_groundedness"], + data_mapping={ + "question": "question", + "context": "context", + "y_pred": "answer", + "y_test": "truth", + }, + output_path=output_path + ) + + def test_evaluate_built_in_content_safety_metrics(self, ai_client, tmpdir): + test_data = [ + {"context": "Some are reported as not having been wanted at all.", + "question": "How is weather in Seattle?", + "answer": "All are reported as being completely and fully wanted." + }, + {"question": "How do you log a model?", + "context": "There are a few ways to log models in Azure Machine Learning. \n\nOne way is to use the `register_model()` method of the `Run` object. The `register_model()` method logs a model file in the Azure Machine Learning service workspace and makes it available for deployment. 
Here's an example:\n\n```python\nfrom azureml.core import Model\n\nmodel_path = '.\/outputs\/my_model.pkl'\nmodel = Model.register(workspace=ws, model_path=model_path, model_name='my_model')\n```\n\nThis code registers the model file located at `model_path` to the Azure Machine Learning service workspace with the name `my_model`. \n\nAnother way to log a model is to save it as an output of a `Run`. If your model generation code is part of a script or Jupyter notebook that runs as an Azure Machine Learning experiment, you can save the model file as an output of the `Run` object. Here's an example:\n\n```python\nfrom sklearn.linear_model import LogisticRegression\nfrom azureml.core.run import Run\n\n# Initialize a run object\nrun = Run.get_context()\n\n# Train your model\nX_train, y_train = ...\nclf = LogisticRegression().fit(X_train, y_train)\n\n# Save the model to the Run object's outputs directory\nmodel_path = 'outputs\/model.pkl'\njoblib.dump(value=clf, filename=model_path)\n\n# Log the model as a run artifact\nrun.upload_file(name=model_path, path_or_stream=model_path)\n```\n\nIn this code, `Run.get_context()` retrieves the current run context object, which you can use to track metadata and metrics for the run. After training your model, you can use `joblib.dump()` to save the model to a file, and then log the file as an artifact of the run using `run.upload_file()`.", + "answer": "There are a few ways to log models in Azure Machine Learning. \n\nOne way is to use the `register_model()` method of the `Run` object. The `register_model()` method logs a model file in the Azure Machine Learning service workspace and makes it available for deployment. Here's an example:\n\n```python\nfrom azureml.core import Model\n\nmodel_path = '.\/outputs\/my_model.pkl'\nmodel = Model.register(workspace=ws, model_path=model_path, model_name='my_model')\n```\n\nThis code registers the model file located at `model_path` to the Azure Machine Learning service workspace with the name `my_model`. \n\nAnother way to log a model is to save it as an output of a `Run`. If your model generation code is part of a script or Jupyter notebook that runs as an Azure Machine Learning experiment, you can save the model file as an output of the `Run` object. Here's an example:\n\n```python\nfrom sklearn.linear_model import LogisticRegression\nfrom azureml.core.run import Run\n\n# Initialize a run object\nrun = Run.get_context()\n\n# Train your model\nX_train, y_train = ...\nclf = LogisticRegression().fit(X_train, y_train)\n\n# Save the model to the Run object's outputs directory\nmodel_path = 'outputs\/model.pkl'\njoblib.dump(value=clf, filename=model_path)\n\n# Log the model as a run artifact\nrun.upload_file(name=model_path, path_or_stream=model_path)\n```\n\nIn this code, `Run.get_context()` retrieves the current run context object, which you can use to track metadata and metrics for the run. After training your model, you can use `joblib.dump()` to save the model to a file, and then log the file as an artifact of the run using `run.upload_file()`." 
+ }, + ] + + with tmpdir.as_cwd(): + output_path = tmpdir + "/evaluation_output" + tracking_uri = ai_client.tracking_uri + + result = evaluate( # This will log metric/artifacts using mlflow + evaluation_name="rag-chat-1", + data=test_data, + task_type="qa", + metrics_list=["violence"], + data_mapping={ + "question": "question", + "context": "context", + "y_pred": "answer", + "y_test": "truth", + }, + output_path=output_path, + tracking_uri=tracking_uri + ) + + metrics_summary = result.metrics_summary + tabular_result = pd.read_json(os.path.join(output_path, "eval_results.jsonl"), lines=True) + + assert "violence_defect_rate" in metrics_summary.keys() + assert tabular_result["violence"][0] in ["Safe"] + assert tabular_result["violence_score"][0] in [0.0] + assert tabular_result["violence_reasoning"] is not None + def test_duplicate_metrics_name(self, e2e_openai_api_base, e2e_openai_api_key, e2e_openai_completion_deployment_name, tmpdir): test_data = [
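
For readers skimming the `_metric_handler.py` hunk, the new behavior amounts to the sketch below: `evaluate` can now run without a model config as long as every requested metric is a content safety metric (exercised by `test_evaluate_built_in_content_safety_metrics`), and it still fails fast otherwise (exercised by `test_evaluate_built_in_metrics_missing_model_config`). This is a minimal illustration only; the helper name `resolve_openai_config` is invented for clarity, and the SDK keeps this logic inline in `MetricHandler.calculate_metrics` rather than exposing such a function.

```python
# Illustrative sketch of the openai_config fallback added in this diff.
# `resolve_openai_config` is an assumed name, not part of the SDK surface.
from typing import Dict, List, Optional

from azure.ai.generative.evaluate._constants import CONTENT_SAFETY_METRICS_LIST


def resolve_openai_config(metrics_mapping: Dict, metrics: List[str]) -> Dict:
    """Return the OpenAI config to use for the promptflow run.

    When only content safety metrics are requested, placeholder values are
    substituted so no real model deployment is required; otherwise a missing
    "openai_params" entry is treated as an error.
    """
    openai_config: Optional[Dict] = metrics_mapping.get("openai_params")
    if openai_config is not None:
        return openai_config

    if all(m in CONTENT_SAFETY_METRICS_LIST for m in metrics):
        # Placeholder values satisfy the downstream connection-setup code path.
        return {
            "api_key": "api_key",
            "api_base": "api_base",
            "api_version": "api_version",
            "api_type": "azure",
            "deployment_id": "deployment_id",
        }

    raise Exception("model_config is required for metrics other than content safety metrics")
```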