Users/singankit/model config optional (#34812)
* Adding tests to capture groundedness with expected values

* Changes to make model config optional

* Adding test for missing model config

* Fixing tests

* Fixing pylint error
singankit authored Mar 18, 2024
1 parent 6272970 commit f9c0eda
Showing 4 changed files with 122 additions and 23 deletions.
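In short, this change makes model_config optional when every requested metric is a content-safety metric. Below is a minimal sketch of the resulting call, modeled on test_evaluate_built_in_content_safety_metrics later in this diff; the import path, sample data, and tracking URI are assumed placeholders, not part of the commit itself.

```python
# Minimal sketch, assuming evaluate() is importable from azure.ai.generative.evaluate
# as the tests below suggest; data and tracking_uri are placeholder values.
from azure.ai.generative.evaluate import evaluate

result = evaluate(  # logs metrics/artifacts via mlflow, as in the tests below
    evaluation_name="rag-chat-1",
    data=[{
        "question": "How is weather in Seattle?",
        "context": "Some are reported as not having been wanted at all.",
        "answer": "All are reported as being completely and fully wanted.",
    }],
    task_type="qa",
    metrics_list=["violence"],  # content-safety only, so no model_config is required
    data_mapping={
        "question": "question",
        "context": "context",
        "y_pred": "answer",
        "y_test": "truth",
    },
    output_path="./evaluation_output",
    tracking_uri="<ai-project-tracking-uri>",  # placeholder
)
print(result.metrics_summary)  # e.g. contains "violence_defect_rate" per the test below
```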
@@ -15,6 +15,17 @@
}


class ContentSafetyMetrics:
"""
Content safety metrics supported by evaluation in azure-ai-generative sdk
"""

HATE_FAIRNESS = "hate_fairness"
VIOLENCE = "violence"
SELF_HARM = "self_harm"
SEXUAL = "sexual"


class EvaluationMetrics:
"""
Evaluation metrics supported by evaluation in azure-ai-generative sdk
@@ -31,10 +42,6 @@ class EvaluationMetrics:
RETRIEVAL_SCORE = "retrieval_score"
GROUNDING_SCORE = "grounding_score"
GENERATION_SCORE = "generation_score"
HATE_FAIRNESS = "hate_fairness"
VIOLENCE = "violence"
SELF_HARM = "self_harm"
SEXUAL = "sexual"


class QaMetrics:
@@ -50,10 +57,10 @@ class QaMetrics:
EvaluationMetrics.GPT_FLUENCY,
EvaluationMetrics.GPT_SIMILARITY,
EvaluationMetrics.F1_SCORE,
EvaluationMetrics.HATE_FAIRNESS,
EvaluationMetrics.VIOLENCE,
EvaluationMetrics.SELF_HARM,
EvaluationMetrics.SEXUAL
ContentSafetyMetrics.HATE_FAIRNESS,
ContentSafetyMetrics.VIOLENCE,
ContentSafetyMetrics.SELF_HARM,
ContentSafetyMetrics.SEXUAL
]


@@ -69,10 +76,10 @@ class ChatMetrics:
EvaluationMetrics.GPT_GROUNDEDNESS,
EvaluationMetrics.GPT_RELEVANCE,
EvaluationMetrics.GPT_RETRIEVAL_SCORE,
EvaluationMetrics.HATE_FAIRNESS,
EvaluationMetrics.VIOLENCE,
EvaluationMetrics.SELF_HARM,
EvaluationMetrics.SEXUAL
ContentSafetyMetrics.HATE_FAIRNESS,
ContentSafetyMetrics.VIOLENCE,
ContentSafetyMetrics.SELF_HARM,
ContentSafetyMetrics.SEXUAL
]


@@ -85,3 +92,10 @@ class ChatMetrics:
QA: QaMetrics,
CHAT: ChatMetrics
}

CONTENT_SAFETY_METRICS_LIST = [
ContentSafetyMetrics.SEXUAL,
ContentSafetyMetrics.SELF_HARM,
ContentSafetyMetrics.VIOLENCE,
ContentSafetyMetrics.HATE_FAIRNESS
]
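The new ContentSafetyMetrics class and CONTENT_SAFETY_METRICS_LIST group the four safety metrics so that other modules can build metric lists and test membership against them. A short usage sketch, assuming these constants live in the azure.ai.generative.evaluate._constants module that the metric handler imports from in the next file:

```python
# Sketch only; the import path is taken from the metric handler import shown below.
from azure.ai.generative.evaluate._constants import (
    CONTENT_SAFETY_METRICS_LIST,
    ContentSafetyMetrics,
)

requested = [ContentSafetyMetrics.VIOLENCE, ContentSafetyMetrics.SELF_HARM]

# The same membership test the metric handler uses to decide whether a
# model config can be skipped.
print(all(m in CONTENT_SAFETY_METRICS_LIST for m in requested))  # True
```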
@@ -8,7 +8,7 @@

import pandas as pd

from azure.ai.generative.evaluate._constants import TASK_TYPE_TO_METRICS_MAPPING, CHAT
from azure.ai.generative.evaluate._constants import TASK_TYPE_TO_METRICS_MAPPING, CHAT, CONTENT_SAFETY_METRICS_LIST
from ._user_agent import USER_AGENT

from ._utils import run_pf_flow_with_dict_list, df_to_dict_list, wait_for_pf_run_to_complete
@@ -23,14 +23,14 @@

class MetricHandler(object):
def __init__(
self,
task_type,
prediction_data: pd.DataFrame,
input_output_data: pd.DataFrame,
test_data,
metrics_mapping=None,
metrics=None,
data_mapping: Optional[Dict] = None,
):
self.task_type = task_type
self.prediction_data = prediction_data
@@ -72,7 +72,20 @@ def calculate_metrics(self) -> Dict:

pf_client = PFClient(user_agent=USER_AGENT)

openai_config = self.metrics_mapping["openai_params"]
openai_config = self.metrics_mapping.get("openai_params")

if openai_config is None:
if all(m in CONTENT_SAFETY_METRICS_LIST for m in metrics):
openai_config = {
"api_key": "api_key",
"api_base": "api_base",
"api_version": "api_version",
"api_type": "azure",
"deployment_id": "deployment_id"
}
else:
raise Exception("model_config is required for metrics other than content safety metrics")

conn_name = "openai_connection"
deployment_id = openai_config["deployment_id"]
if not openai_config["api_type"] or openai_config["api_type"] == "azure":
@@ -98,7 +111,8 @@ def calculate_metrics(self) -> Dict:
nodes_list = NODE_LIST_BY_TASK[self.task_type]

if self.task_type == CHAT:
pf_run = run_pf_flow_with_dict_list(flow_path, dict_list)
pf_run = run_pf_flow_with_dict_list(flow_path, dict_list, flow_params={
"connections": {node: {"connection": conn_name} for node in nodes_list}})
else:
pf_run = run_pf_flow_with_dict_list(
flow_path, dict_list, flow_params={"connections": {node: connection_override for node in nodes_list}}
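The behavioral core of the change is the openai_params fallback above: when it is missing and every requested metric is a content-safety metric, a placeholder config is substituted; otherwise an exception is raised. Below is a self-contained sketch of that branch, with resolve_openai_config as a hypothetical wrapper around the same logic.

```python
# Standalone sketch; resolve_openai_config is hypothetical, but the membership
# check and the placeholder config mirror the handler change above.
from typing import Dict, List, Optional

CONTENT_SAFETY_METRICS_LIST = ["sexual", "self_harm", "violence", "hate_fairness"]


def resolve_openai_config(metrics: List[str], metrics_mapping: Dict) -> Dict:
    openai_config: Optional[Dict] = metrics_mapping.get("openai_params")
    if openai_config is None:
        if all(m in CONTENT_SAFETY_METRICS_LIST for m in metrics):
            # Content-safety metrics do not call an OpenAI deployment, so a
            # placeholder config is enough to satisfy the downstream plumbing.
            openai_config = {
                "api_key": "api_key",
                "api_base": "api_base",
                "api_version": "api_version",
                "api_type": "azure",
                "deployment_id": "deployment_id",
            }
        else:
            raise Exception(
                "model_config is required for metrics other than content safety metrics"
            )
    return openai_config


print(resolve_openai_config(["violence"], {})["api_type"])  # -> azure
# resolve_openai_config(["gpt_groundedness"], {})  # raises Exception
```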
2 changes: 2 additions & 0 deletions sdk/ai/azure-ai-generative/tests/conftest.py
@@ -137,6 +137,8 @@ def sanitized_environment_variables(
"AI_SUBSCRIPTION_ID": "00000000-0000-0000-0000-000000000",
"AI_RESOURCE_GROUP": "00000",
"AI_WORKSPACE_NAME": "00000",
"AI_PROJECT_NAME": "00000",
"AI_TEAM_NAME": "00000",
"AI_FEATURE_STORE_NAME": "00000",
"AI_TEST_STORAGE_ACCOUNT_NAME": "teststorageaccount",
"AI_TEST_STORAGE_ACCOUNT_PRIMARY_KEY": fake_datastore_key,
@@ -65,6 +65,75 @@ def test_evaluate_built_in_metrics(self, e2e_openai_api_base, e2e_openai_api_key
assert tabular_result["gpt_groundedness"][0] in [1,2]
assert tabular_result["gpt_groundedness"][1] in [5, 4]

def test_evaluate_built_in_metrics_missing_model_config(self, tmpdir):
test_data = [
{"context": "Some are reported as not having been wanted at all.",
"question": "",
"answer": "All are reported as being completely and fully wanted."
},
{"question": "How do you log a model?",
"context": "There are a few ways to log models in Azure Machine Learning. \n\nOne way is to use the `register_model()` method of the `Run` object. The `register_model()` method logs a model file in the Azure Machine Learning service workspace and makes it available for deployment. Here's an example:\n\n```python\nfrom azureml.core import Model\n\nmodel_path = '.\/outputs\/my_model.pkl'\nmodel = Model.register(workspace=ws, model_path=model_path, model_name='my_model')\n```\n\nThis code registers the model file located at `model_path` to the Azure Machine Learning service workspace with the name `my_model`. \n\nAnother way to log a model is to save it as an output of a `Run`. If your model generation code is part of a script or Jupyter notebook that runs as an Azure Machine Learning experiment, you can save the model file as an output of the `Run` object. Here's an example:\n\n```python\nfrom sklearn.linear_model import LogisticRegression\nfrom azureml.core.run import Run\n\n# Initialize a run object\nrun = Run.get_context()\n\n# Train your model\nX_train, y_train = ...\nclf = LogisticRegression().fit(X_train, y_train)\n\n# Save the model to the Run object's outputs directory\nmodel_path = 'outputs\/model.pkl'\njoblib.dump(value=clf, filename=model_path)\n\n# Log the model as a run artifact\nrun.upload_file(name=model_path, path_or_stream=model_path)\n```\n\nIn this code, `Run.get_context()` retrieves the current run context object, which you can use to track metadata and metrics for the run. After training your model, you can use `joblib.dump()` to save the model to a file, and then log the file as an artifact of the run using `run.upload_file()`.",
"answer": "There are a few ways to log models in Azure Machine Learning. \n\nOne way is to use the `register_model()` method of the `Run` object. The `register_model()` method logs a model file in the Azure Machine Learning service workspace and makes it available for deployment. Here's an example:\n\n```python\nfrom azureml.core import Model\n\nmodel_path = '.\/outputs\/my_model.pkl'\nmodel = Model.register(workspace=ws, model_path=model_path, model_name='my_model')\n```\n\nThis code registers the model file located at `model_path` to the Azure Machine Learning service workspace with the name `my_model`. \n\nAnother way to log a model is to save it as an output of a `Run`. If your model generation code is part of a script or Jupyter notebook that runs as an Azure Machine Learning experiment, you can save the model file as an output of the `Run` object. Here's an example:\n\n```python\nfrom sklearn.linear_model import LogisticRegression\nfrom azureml.core.run import Run\n\n# Initialize a run object\nrun = Run.get_context()\n\n# Train your model\nX_train, y_train = ...\nclf = LogisticRegression().fit(X_train, y_train)\n\n# Save the model to the Run object's outputs directory\nmodel_path = 'outputs\/model.pkl'\njoblib.dump(value=clf, filename=model_path)\n\n# Log the model as a run artifact\nrun.upload_file(name=model_path, path_or_stream=model_path)\n```\n\nIn this code, `Run.get_context()` retrieves the current run context object, which you can use to track metadata and metrics for the run. After training your model, you can use `joblib.dump()` to save the model to a file, and then log the file as an artifact of the run using `run.upload_file()`."
},
]

with tmpdir.as_cwd():
output_path = tmpdir + "/evaluation_output"

with pytest.raises(Exception) as ex:
result = evaluate( # This will log metric/artifacts using mlflow
evaluation_name="rag-chat-1",
data=test_data,
task_type="qa",
metrics_list=["gpt_groundedness"],
data_mapping={
"question": "question",
"context": "context",
"y_pred": "answer",
"y_test": "truth",
},
output_path=output_path
)

def test_evaluate_built_in_content_safety_metrics(self, ai_client, tmpdir):
test_data = [
{"context": "Some are reported as not having been wanted at all.",
"question": "How is weather in Seattle?",
"answer": "All are reported as being completely and fully wanted."
},
{"question": "How do you log a model?",
"context": "There are a few ways to log models in Azure Machine Learning. \n\nOne way is to use the `register_model()` method of the `Run` object. The `register_model()` method logs a model file in the Azure Machine Learning service workspace and makes it available for deployment. Here's an example:\n\n```python\nfrom azureml.core import Model\n\nmodel_path = '.\/outputs\/my_model.pkl'\nmodel = Model.register(workspace=ws, model_path=model_path, model_name='my_model')\n```\n\nThis code registers the model file located at `model_path` to the Azure Machine Learning service workspace with the name `my_model`. \n\nAnother way to log a model is to save it as an output of a `Run`. If your model generation code is part of a script or Jupyter notebook that runs as an Azure Machine Learning experiment, you can save the model file as an output of the `Run` object. Here's an example:\n\n```python\nfrom sklearn.linear_model import LogisticRegression\nfrom azureml.core.run import Run\n\n# Initialize a run object\nrun = Run.get_context()\n\n# Train your model\nX_train, y_train = ...\nclf = LogisticRegression().fit(X_train, y_train)\n\n# Save the model to the Run object's outputs directory\nmodel_path = 'outputs\/model.pkl'\njoblib.dump(value=clf, filename=model_path)\n\n# Log the model as a run artifact\nrun.upload_file(name=model_path, path_or_stream=model_path)\n```\n\nIn this code, `Run.get_context()` retrieves the current run context object, which you can use to track metadata and metrics for the run. After training your model, you can use `joblib.dump()` to save the model to a file, and then log the file as an artifact of the run using `run.upload_file()`.",
"answer": "There are a few ways to log models in Azure Machine Learning. \n\nOne way is to use the `register_model()` method of the `Run` object. The `register_model()` method logs a model file in the Azure Machine Learning service workspace and makes it available for deployment. Here's an example:\n\n```python\nfrom azureml.core import Model\n\nmodel_path = '.\/outputs\/my_model.pkl'\nmodel = Model.register(workspace=ws, model_path=model_path, model_name='my_model')\n```\n\nThis code registers the model file located at `model_path` to the Azure Machine Learning service workspace with the name `my_model`. \n\nAnother way to log a model is to save it as an output of a `Run`. If your model generation code is part of a script or Jupyter notebook that runs as an Azure Machine Learning experiment, you can save the model file as an output of the `Run` object. Here's an example:\n\n```python\nfrom sklearn.linear_model import LogisticRegression\nfrom azureml.core.run import Run\n\n# Initialize a run object\nrun = Run.get_context()\n\n# Train your model\nX_train, y_train = ...\nclf = LogisticRegression().fit(X_train, y_train)\n\n# Save the model to the Run object's outputs directory\nmodel_path = 'outputs\/model.pkl'\njoblib.dump(value=clf, filename=model_path)\n\n# Log the model as a run artifact\nrun.upload_file(name=model_path, path_or_stream=model_path)\n```\n\nIn this code, `Run.get_context()` retrieves the current run context object, which you can use to track metadata and metrics for the run. After training your model, you can use `joblib.dump()` to save the model to a file, and then log the file as an artifact of the run using `run.upload_file()`."
},
]

with tmpdir.as_cwd():
output_path = tmpdir + "/evaluation_output"
tracking_uri = ai_client.tracking_uri

result = evaluate( # This will log metric/artifacts using mlflow
evaluation_name="rag-chat-1",
data=test_data,
task_type="qa",
metrics_list=["violence"],
data_mapping={
"question": "question",
"context": "context",
"y_pred": "answer",
"y_test": "truth",
},
output_path=output_path,
tracking_uri=tracking_uri
)

metrics_summary = result.metrics_summary
tabular_result = pd.read_json(os.path.join(output_path, "eval_results.jsonl"), lines=True)

assert "violence_defect_rate" in metrics_summary.keys()
assert tabular_result["violence"][0] in ["Safe"]
assert tabular_result["violence_score"][0] in [0.0]
assert tabular_result["violence_reasoning"] is not None


def test_duplicate_metrics_name(self, e2e_openai_api_base, e2e_openai_api_key, e2e_openai_completion_deployment_name, tmpdir):
test_data = [