
Commit 3a8e24f

Merge pull request #38 from code-dot-org/ceara/AITT-408-accuracy-threshold

Ceara/aitt 408 accuracy threshold

2 parents b3dc1df + 0c29478

8 files changed: +152 −17 lines changed

TESTING.md

Lines changed: 17 additions & 1 deletion
@@ -2,10 +2,11 @@
 
 ## Unit Tests
 
-The `./tests` directory contains two categories of test:
+The `./tests` directory contains three categories of test:
 
 * `unit`: Unit tests for library functions in the `./lib` path.
 * `routes`: Tests routes and their helpers in the `./src` as a unit.
+* `accuracy`: Tests accuracy against thresholds by calling OpenAI. Not run by default.
 
 All tests are using [pytest](https://docs.pytest.org/en/7.4.x/).
 
@@ -26,6 +27,21 @@ just run `pytest` within a running container's shell session by using the
 PYTHONPATH=/app pytest
 ```
 
+## Accuracy Tests
+
+**Running the Accuracy test hits the OpenAI endpoint and is expensive! Only run this test infrequently**
+
+To run the accuracy threshold test, follow directions in `README.md` to set up your local
+environment for running the Rubric Tester. You can then run `./bin/test_accuracy.sh` to run
+tests locally, including the accuracy threshold test.
+
+You can pass any arguments to pytest with this script. For instance, the `-k` argument can filter tests by name:
+
+```
+# Run only tests with 'accuracy' in the name:
+./bin/test_accuracy.sh -k accuracy
+```
+
 ## Scripted
 
 This assumes you have built and are running the container as depicted in the main `README.md`.

bin/test_accuracy.sh

Lines changed: 6 additions & 0 deletions
@@ -0,0 +1,6 @@
+#!/bin/bash
+
+set -eu
+
+echo "Running: \`coverage run -m pytest --accuracy $@ && coverage report -m\`"
+coverage run -m pytest --accuracy $@ && coverage report -m

lib/assessment/config.py

Lines changed: 3 additions & 3 deletions
@@ -3,11 +3,11 @@
 SUPPORTED_MODELS = ['gpt-4-0314', 'gpt-4-32k-0314', 'gpt-4-0613', 'gpt-4-32k-0613', 'gpt-4-1106-preview']
 DEFAULT_MODEL = 'gpt-4-0613'
 LESSONS = {
-    "U3-2022-L10" : "1ROCbvHb3yWGVoQqzKAjwdaF0dSRPUjy_",
-    "U3-2022-L13" : "1kGHeY5LRpFJ9xVRoBEWbyOJyKm4wClqw",
+    # "U3-2022-L10" : "1ROCbvHb3yWGVoQqzKAjwdaF0dSRPUjy_",
+    # "U3-2022-L13" : "1kGHeY5LRpFJ9xVRoBEWbyOJyKm4wClqw",
     "U3-2022-L17" : "1WirJLIFgo-anxAz-kZXDVQ2Tl_8OuX22",
     "U3-2022-L20" : "115BHvZ1kJC2xhUSOBkLiE8DC1YgcjyRd",
-    "U3-2022-L23" : "12OJex4l9OhWrnbLenpvZAibtfiFWWdzx",
+    # "U3-2022-L23" : "12OJex4l9OhWrnbLenpvZAibtfiFWWdzx",
     "New-U3-2022-L10" : "15xAUFVeGkXeG18mDWBOKN6yJPpI185tg",
     "New-U3-2022-L13" : "14LI9eRRgxL5rRQK6FoUI0ow_YIb5V0mg",
 }
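
The ids in `LESSONS` are Google Drive folder ids; `rubric_tester.py` passes them straight to `gdown` when a lesson directory is missing (see the `try`/`except` added in `main()` below). A minimal sketch of that download step, using one of the ids above; the output path mirrors `base_dir` in `rubric_tester.py`:

```
# Sketch of how a LESSONS entry is consumed; mirrors the download step
# in rubric_tester.main(). Folder id copied from LESSONS above.
import gdown

lesson, folder_id = "U3-2022-L17", "1WirJLIFgo-anxAz-kZXDVQ2Tl_8OuX22"
gdown.download_folder(id=folder_id, output=f"lesson_data/{lesson}")
```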

lib/assessment/rubric_tester.py

Lines changed: 55 additions & 5 deletions
@@ -11,6 +11,7 @@
 import io
 import logging
 import gdown
+import pprint
 
 from sklearn.metrics import accuracy_score, confusion_matrix
 from collections import defaultdict
@@ -27,6 +28,10 @@
 output_dir_name = 'output'
 base_dir = 'lesson_data'
 cache_dir_name = 'cached_responses'
+accuracy_threshold_file = 'accuracy_thresholds.json'
+accuracy_threshold_dir = 'tests/data'
+
+pp = pprint.PrettyPrinter(indent=2)
 
 def command_line_options():
     parser = argparse.ArgumentParser(description='Usage')
@@ -51,6 +56,8 @@ def command_line_options():
                         help='Temperature of the LLM. Defaults to 0.0.')
     parser.add_argument('-d', '--download', action='store_true',
                         help='re-download lesson files, overwriting previous files')
+    parser.add_argument('-a', '--accuracy', action='store_true',
+                        help='Run against accuracy thresholds')
 
     args = parser.parse_args()
 
@@ -107,6 +114,13 @@ def get_actual_labels(actual_labels_file, prefix):
         actual_labels[student_id] = dict(row)
     return actual_labels
 
+def get_accuracy_thresholds(accuracy_threshold_file=accuracy_threshold_file, prefix=accuracy_threshold_dir):
+    thresholds = None
+    if os.path.exists(os.path.join(prefix, accuracy_threshold_file)):
+        with open(os.path.join(prefix, accuracy_threshold_file), 'r') as f:
+            thresholds = json.load(f)
+    return thresholds
+
 
 def get_examples(prefix):
     example_js_files = sorted(glob.glob(os.path.join(prefix, 'examples', '*.js')))
@@ -167,11 +181,11 @@ def compute_accuracy(actual_labels, predicted_labels, passing_labels):
         actual = actual_by_criteria[criteria]
 
         confusion_by_criteria[criteria] = confusion_matrix(actual, predicted, labels=label_names)
-        accuracy_by_criteria[criteria] = accuracy_score(actual, predicted) * 100
+        accuracy_by_criteria[criteria] = accuracy_score(actual, predicted)
         overall_predicted.extend(predicted)
         overall_actual.extend(actual)
 
-    overall_accuracy = accuracy_score(overall_actual, overall_predicted) * 100
+    overall_accuracy = accuracy_score(overall_actual, overall_predicted)
     overall_confusion = confusion_matrix(overall_actual, overall_predicted, labels=label_names)
 
     return accuracy_by_criteria, overall_accuracy, confusion_by_criteria, overall_confusion, label_names
@@ -205,13 +219,25 @@ def main():
     command_line = " ".join(os.sys.argv)
     options = command_line_options()
     main_start_time = time.time()
+    accuracy_failures = {}
+    accuracy_pass = True
+    accuracy_thresholds = None
+
+    print(options)
+
+    if options.accuracy:
+        accuracy_thresholds = get_accuracy_thresholds()
 
     for lesson in options.lesson_names:
         prefix = os.path.join(base_dir, lesson)
 
         # download lesson files
         if not os.path.exists(prefix) or options.download:
-            gdown.download_folder(id=LESSONS[lesson], output=prefix)
+            try:
+                gdown.download_folder(id=LESSONS[lesson], output=prefix)
+            except Exception as e:
+                print(f"Could not download lesson {lesson}")
+                logging.error(e)
 
         # read in lesson files, validate them
         prompt, standard_rubric = read_inputs(prompt_file, standard_rubric_file, prefix)
@@ -244,16 +270,18 @@ def main():
 
         # calculate accuracy and generate report
         accuracy_by_criteria, overall_accuracy, confusion_by_criteria, overall_confusion, label_names = compute_accuracy(actual_labels, predicted_labels, options.passing_labels)
+        overall_accuracy_percent = overall_accuracy * 100
+        accuracy_by_criteria_percent = {k:v*100 for k,v in accuracy_by_criteria.items()}
         report = Report()
        report.generate_html_output(
            output_file,
            prompt,
            rubric,
-            accuracy=overall_accuracy,
+            accuracy=overall_accuracy_percent,
            predicted_labels=predicted_labels,
            actual_labels=actual_labels,
            passing_labels=options.passing_labels,
-            accuracy_by_criteria=accuracy_by_criteria,
+            accuracy_by_criteria=accuracy_by_criteria_percent,
            errors=errors,
            command_line=command_line,
            confusion_by_criteria=confusion_by_criteria,
@@ -263,8 +291,30 @@ def main():
         )
         logging.info(f"main finished in {int(time.time() - main_start_time)} seconds")
 
+        if options.accuracy and accuracy_thresholds is not None:
+            if overall_accuracy < accuracy_thresholds[lesson]['overall']:
+                accuracy_pass = False
+                accuracy_failures[lesson] = {}
+                accuracy_failures[lesson]['overall'] = {}
+                accuracy_failures[lesson]['overall']['accuracy_score'] = overall_accuracy
+                accuracy_failures[lesson]['overall']['threshold'] = accuracy_thresholds[lesson]['overall']
+            for key_concept in accuracy_by_criteria:
+                if accuracy_by_criteria[key_concept] < accuracy_thresholds[lesson]['key_concepts'][key_concept]:
+                    accuracy_pass = False
+                    if lesson not in accuracy_failures.keys(): accuracy_failures[lesson] = {}
+                    if 'key_concepts' not in accuracy_failures[lesson].keys(): accuracy_failures[lesson]['key_concepts'] = {}
+                    if key_concept not in accuracy_failures[lesson]['key_concepts'].keys(): accuracy_failures[lesson]['key_concepts'][key_concept] = {}
+                    accuracy_failures[lesson]['key_concepts'][key_concept]['accuracy_score'] = accuracy_by_criteria[key_concept]
+                    accuracy_failures[lesson]['key_concepts'][key_concept]['threshold'] = accuracy_thresholds[lesson]['key_concepts'][key_concept]
+
         os.system(f"open {output_file}")
 
+    if not accuracy_pass and len(accuracy_failures.keys()) > 0:
+        logging.error(f"The following thresholds were not met:\n{pp.pformat(accuracy_failures)}")
+    print(("PASS" if accuracy_pass else "FAIL"))
+
+    return accuracy_pass
+
 
 def init():
     if __name__ == '__main__':
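
The threshold bookkeeping in `main()` builds `accuracy_failures` with repeated key-existence checks. A behavior-equivalent sketch using `collections.defaultdict` (already imported at the top of this file); the lesson name, scores, and thresholds below are invented for illustration:

```
from collections import defaultdict

# Hypothetical stand-ins for compute_accuracy() output and the loaded
# accuracy_thresholds.json entry for one lesson.
lesson = 'U3-2022-L20'
overall_accuracy = 0.62
accuracy_by_criteria = {'Variables': 0.55, 'Program Development 2': 0.9}
thresholds = {'overall': 0.7,
              'key_concepts': {'Variables': 0.7, 'Program Development 2': 0.7}}

accuracy_pass = True
accuracy_failures = defaultdict(lambda: defaultdict(dict))

if overall_accuracy < thresholds['overall']:
    accuracy_pass = False
    accuracy_failures[lesson]['overall'] = {
        'accuracy_score': overall_accuracy,
        'threshold': thresholds['overall'],
    }
for key_concept, score in accuracy_by_criteria.items():
    if score < thresholds['key_concepts'][key_concept]:
        accuracy_pass = False
        accuracy_failures[lesson]['key_concepts'][key_concept] = {
            'accuracy_score': score,
            'threshold': thresholds['key_concepts'][key_concept],
        }

print('PASS' if accuracy_pass else 'FAIL')  # FAIL: overall and 'Variables' miss 0.7
```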

tests/accuracy/conftest.py

Whitespace-only changes.

tests/accuracy/test_accuracy.py

Lines changed: 19 additions & 0 deletions
@@ -0,0 +1,19 @@
+import pytest
+import os
+
+from unittest import mock
+
+from lib.assessment.rubric_tester import (
+    main,
+)
+
+accuracy = pytest.mark.skipif("not config.getoption('accuracy')")
+
+@accuracy
+@pytest.mark.accuracy_setup
+class TestAccuracy:
+    def test_accuracy(self):
+        assert "OPENAI_API_KEY" in os.environ
+        with mock.patch('sys.argv', ['rubric_tester.py', '-a']):
+            ret = main()
+        assert ret == True
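
`pytest.mark.skipif` with a string condition is evaluated by pytest with `config` in scope, so `TestAccuracy` is collected but skipped unless `--accuracy` was passed (the option registered in `tests/conftest.py` below). An equivalent, more explicit gate could use the standard `pytest_collection_modifyitems` hook; a sketch, assuming it lived in `tests/conftest.py`:

```
# Sketch of explicit gating equivalent to the string-condition skipif
# above; pytest_collection_modifyitems is a standard pytest hook.
import pytest

def pytest_collection_modifyitems(config, items):
    if config.getoption('accuracy'):
        return  # --accuracy given: leave accuracy tests runnable
    skip_marker = pytest.mark.skip(reason='need --accuracy option to run')
    for item in items:
        if 'accuracy_setup' in item.keywords:
            item.add_marker(skip_marker)
```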

tests/conftest.py

Lines changed: 17 additions & 8 deletions
@@ -9,7 +9,15 @@
 import contextlib
 import os
 
-
+def pytest_addoption(parser):
+    parser.addoption('--accuracy', action='store_true', dest="accuracy",
+                     default=False, help="enable accuracy tests that run openai")
+
+def pytest_configure(config):
+    config.addinivalue_line(
+        "markers", "accuracy_setup"
+    )
+
 @pytest.fixture()
 def app():
     app = create_app()
@@ -37,17 +45,18 @@ def configured_app():
 
     # clean up / reset resources here
 
-
 @pytest.fixture(autouse=True)
-def mock_env_vars():
+def mock_env_vars(request):
     """ Ensures env vars are not touched by tests.
     """
-
-    from unittest.mock import patch
-
-    # Ensure the os.environ passes out a new dictionary
-    with patch.dict(os.environ, {}, clear=True):
+    if 'accuracy_setup' in request.keywords:
         yield
+    else:
+        from unittest.mock import patch
+        print("no env vars")
+        # Ensure the os.environ passes out a new dictionary
+        with patch.dict(os.environ, {}, clear=True):
+            yield
 
 
 @pytest.fixture()
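
With this change, the autouse `mock_env_vars` fixture consults `request.keywords`: tests marked `accuracy_setup` see the real environment (so `OPENAI_API_KEY` survives for the accuracy run), while every other test still gets `os.environ` patched to an empty dict. A pair of hypothetical tests illustrating the two branches:

```
# Hypothetical tests showing both branches of mock_env_vars.
import os
import pytest

def test_env_cleared_by_default():
    # No marker: the autouse fixture patches os.environ to {}.
    assert 'OPENAI_API_KEY' not in os.environ

@pytest.mark.accuracy_setup
def test_env_passes_through():
    # Marker present: the fixture yields without patching, so real
    # variables (e.g. PATH on most systems) remain visible.
    assert 'PATH' in os.environ
```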

tests/data/accuracy_thresholds.json

Lines changed: 35 additions & 0 deletions
@@ -0,0 +1,35 @@
+{
+  "U3-2022-L17": {
+    "overall": 0.7,
+    "key_concepts": {
+      "Algorithms and Control - Conditionals": 0.7,
+      "Algorithms and Control - User Input": 0.7,
+      "Modularity - Multiple Sprites": 0.7,
+      "Position and Movement": 0.7
+    }
+  },
+  "U3-2022-L20": {
+    "overall": 0.7,
+    "key_concepts": {
+      "Algorithms and Control Structures": 0.7,
+      "Program Development 2": 0.7,
+      "Variables": 0.7
+    }
+  },
+  "New-U3-2022-L10": {
+    "overall": 0.7,
+    "key_concepts": {
+      "Modularity - Sprites and Sprite Properties": 0.7,
+      "Position - Elements and the Coordinate System": 0.7,
+      "Program Development - Program Sequence": 0.7
+    }
+  },
+  "New-U3-2022-L13": {
+    "overall": 0.7,
+    "key_concepts": {
+      "Modularity - Sprites and Sprite Properties": 0.7,
+      "Optional \u201cStretch\u201d Feature - Variables": 0.7,
+      "Position and Movement": 0.7
+    }
+  }
+}
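
Each lesson entry pairs an `overall` threshold with per-key-concept thresholds, all on the 0-1 scale that `compute_accuracy` now returns. `get_accuracy_thresholds()` loads this file from `tests/data`, and `main()` compares each score against it. A short sketch of that comparison for one lesson; the sample accuracy values are invented:

```
import json

# Load the thresholds the same way get_accuracy_thresholds() does.
with open('tests/data/accuracy_thresholds.json') as f:
    thresholds = json.load(f)

lesson = 'U3-2022-L17'
overall_accuracy = 0.75                                 # hypothetical score
accuracy_by_criteria = {'Position and Movement': 0.65}  # hypothetical score

passed = overall_accuracy >= thresholds[lesson]['overall'] and all(
    score >= thresholds[lesson]['key_concepts'][concept]
    for concept, score in accuracy_by_criteria.items()
)
print('PASS' if passed else 'FAIL')  # FAIL: 'Position and Movement' < 0.7
```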
