2 changes: 2 additions & 0 deletions app/data/RiskAssessment.py
@@ -1,3 +1,5 @@
# Class used to create Risk Assessment examples with methods to create LLM prompts specific to the risk assessment example from LLM prompt templates

from typing import Type

from ..utils.LLMCaller import *
4 changes: 1 addition & 3 deletions app/data/example_risk_assessments.py
@@ -1,6 +1,4 @@
# Learnings:
# 1. Keeping a safe distance away from a possible projectile is a prevention measure.
# The hazard event is therefore the projectile hitting someone, not the projectile being released.
# Risk Assessments used to test the accuracy of LLM prompts

import numpy as np

113 changes: 0 additions & 113 deletions app/evaluation.py
@@ -26,7 +26,6 @@ class Params(TypedDict):
is_feedback_text: bool
is_risk_matrix: bool
is_risk_assessment: bool
are_all_input_fields_entered_manually: bool
LLM: str

def provide_feedback_on_risk_matrix(response):
@@ -237,7 +236,6 @@ def evaluation_function(response: Any, answer: Any, params: Params) -> Result:
LLM_name = params["LLM"]
LLM = LLM_dictionary[LLM_name]

if params['are_all_input_fields_entered_manually'] == True:
activity, hazard, how_it_harms, who_it_harms, uncontrolled_likelihood, uncontrolled_severity, uncontrolled_risk, prevention, mitigation, controlled_likelihood, controlled_severity, controlled_risk = np.array(response).flatten()

RA = RiskAssessment(activity=activity, hazard=hazard, who_it_harms=who_it_harms, how_it_harms=how_it_harms,
@@ -393,115 +391,4 @@ def evaluation_function(response: Any, answer: Any, params: Params) -> Result:
{feedback_for_correct_answers} \n\n\n\n\n
{no_information_provided_message}'''

return Result(is_correct=is_everything_correct, feedback=feedback)

if params['are_all_input_fields_entered_manually'] == False:

prevention, mitigation = np.array(response).flatten()

activity = 'Heat transfer lab'
hazard = 'Boiling hot water'
who_it_harms = 'Students'
how_it_harms = 'Burns'

hazard_event = 'Boiling hot water split on student'
harm_caused = 'Burns'

RA = RiskAssessment(activity=activity, hazard=hazard, who_it_harms=who_it_harms, how_it_harms=how_it_harms,
uncontrolled_likelihood=1, uncontrolled_severity=1,
uncontrolled_risk=1, prevention=prevention, mitigation=mitigation,
controlled_likelihood=1, controlled_severity=1, controlled_risk=1,
prevention_prompt_expected_class='prevention', mitigation_prompt_expected_class='mitigation', risk_domain='')

input_check_feedback_message = RA.get_input_check_feedback_message()

if input_check_feedback_message != True:
return Result(is_correct=False,
feedback=f'''\n\n\n\n\n # Feedback:\n\n\n\n\n
\n\n\n\n\n## {input_check_feedback_message}\n\n\n\n\n''')

feedback_for_incorrect_answers = '\n\n\n\n# Feedback for Incorrect Answers\n\n\n\n'
feedback_for_correct_answers = '\n\n\n\n# Feedback for Correct Answers\n\n\n\n'

fields_for_which_no_information_provided = []

is_everything_correct = True

# PREVENTION CHECKS
no_information_provided_for_prevention_prompt_input = RA.get_no_information_provided_for_prevention_input()
no_information_provided_for_prevention_prompt_output, no_information_provided_for_prevention_pattern = RA.get_prompt_output_and_pattern_matched(prompt_input_object=no_information_provided_for_prevention_prompt_input, LLM_caller=LLM)

if no_information_provided_for_prevention_pattern == 'no information provided' or RA.prevention == '':
fields_for_which_no_information_provided.append('Prevention')

else:

control_measure_prompt_with_prevention_input = RA.get_control_measure_prompt_with_prevention_input()
control_measure_prompt_with_prevention_output, control_measure_prompt_with_prevention_pattern = RA.get_prompt_output_and_pattern_matched(prompt_input_object=control_measure_prompt_with_prevention_input,
LLM_caller=LLM,
harm_caused=harm_caused,
hazard_event=hazard_event)

feedback_for_correct_answers, feedback_for_incorrect_answers, is_everything_correct = provide_feedback_on_control_measure_input(
control_measure_input_field='prevention',
control_measure_prompt_input=control_measure_prompt_with_prevention_input,
control_measure_prompt_output=control_measure_prompt_with_prevention_output,
control_measure_prompt_pattern=control_measure_prompt_with_prevention_pattern,
feedback_for_correct_answers=feedback_for_correct_answers,
feedback_for_incorrect_answers=feedback_for_incorrect_answers,
is_everything_correct=is_everything_correct,
risk_assessment=RA,
LLM_caller=LLM
)

# MITIGATION CHECKS
no_information_provided_for_mitigation_prompt_input = RA.get_no_information_provided_for_mitigation_input()
no_information_provided_for_mitigation_prompt_output, no_information_provided_for_mitigation_pattern = RA.get_prompt_output_and_pattern_matched(prompt_input_object=no_information_provided_for_mitigation_prompt_input, LLM_caller=LLM)

if no_information_provided_for_mitigation_pattern == 'no information provided' or RA.mitigation == '':
fields_for_which_no_information_provided.append('Mitigation')
else:

control_measure_prompt_with_mitigation_input = RA.get_control_measure_prompt_with_mitigation_input()
control_measure_prompt_with_mitigation_output, control_measure_prompt_with_mitigation_pattern = RA.get_prompt_output_and_pattern_matched(prompt_input_object=control_measure_prompt_with_mitigation_input,
LLM_caller=LLM,
harm_caused=harm_caused,
hazard_event=hazard_event)

feedback_for_correct_answers, feedback_for_incorrect_answers, is_everything_correct = provide_feedback_on_control_measure_input(
control_measure_input_field='mitigation',
control_measure_prompt_input=control_measure_prompt_with_mitigation_input,
control_measure_prompt_output=control_measure_prompt_with_mitigation_output,
control_measure_prompt_pattern=control_measure_prompt_with_mitigation_pattern,
feedback_for_correct_answers=feedback_for_correct_answers,
feedback_for_incorrect_answers=feedback_for_incorrect_answers,
is_everything_correct=is_everything_correct,
risk_assessment=RA,
LLM_caller=LLM
)

if is_everything_correct == True:
feedback_for_incorrect_answers = '# Congratulations! All your answers are correct!'

if fields_for_which_no_information_provided == []:
no_information_provided_message = ''
else:
no_information_provided_message = f'\n\n\n\n\n## Fields for which no information is provided and hence no feedback given: {", ".join(fields_for_which_no_information_provided)}\n\n\n\n\n'

if fields_for_which_no_information_provided != ['Prevention', 'Mitigation']:
hazard_event_and_harm_caused_inferred_message = f'''## The following were inferred from your answers: \n\n\n\n\n
\n\n\n\n\n### Event that leads to harm: "{hazard_event}"\n\n\n\n\n
\n\n\n\n\n### Harm caused to '{RA.who_it_harms}': "{harm_caused}".\n\n\n\n
\n\n\n\n\n### If they are incorrect, please make these more explicit in the "Hazard" and "How it harms" fields.\n\n\n\n\n'''
else:
hazard_event_and_harm_caused_inferred_message = ''

feedback_for_correct_answers += f'''
\n\n\n\n### There are no errors in your likelihood, severity, and risk values.\n\n\n\n'''

feedback=f'''{hazard_event_and_harm_caused_inferred_message} \n\n\n\n\n
{feedback_for_incorrect_answers} \n\n\n\n\n
{feedback_for_correct_answers} \n\n\n\n\n
{no_information_provided_message}'''

return Result(is_correct=is_everything_correct, feedback=feedback)
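With the `are_all_input_fields_entered_manually` branch removed, the evaluation function is left with a single path that always expects the full set of twelve risk-assessment fields. The snippet below is a minimal, hypothetical sketch of that unpacking step — the helper name `unpack_response` and the size check are illustrative additions, not the repository's code; the field order is taken from the unpacking line retained in `evaluation.py`.

```python
import numpy as np

# Field order taken from the unpacking line retained in evaluation.py.
FIELD_NAMES = [
    "activity", "hazard", "how_it_harms", "who_it_harms",
    "uncontrolled_likelihood", "uncontrolled_severity", "uncontrolled_risk",
    "prevention", "mitigation",
    "controlled_likelihood", "controlled_severity", "controlled_risk",
]

def unpack_response(response):
    """Flatten a (possibly nested) response into the twelve expected fields."""
    values = np.array(response).flatten()
    if values.size != len(FIELD_NAMES):
        raise ValueError(f"Expected {len(FIELD_NAMES)} fields, got {values.size}")
    return dict(zip(FIELD_NAMES, values))
```

A `RiskAssessment` can then be constructed from this mapping, mirroring the constructor call that remains in `evaluation.py`.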
2 changes: 2 additions & 0 deletions app/prompts/BasePromptInput.py
@@ -1,3 +1,5 @@
# Base class that other PromptInput classes inherit from.

try:
from utils.RegexPatternMatcher import RegexPatternMatcher
except:
2 changes: 2 additions & 0 deletions app/prompts/ControlMeasureClassification.py
@@ -1,3 +1,5 @@
# PromptInput class used to classify a control measure as either a prevention, mitigation, both or neither. This prompt takes the "event that leads to harm" and the "harm caused" as input.

from ..prompts.BasePromptInput import BasePromptInput
from ..utils.RegexPatternMatcher import RegexPatternMatcher

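As a rough illustration of the classification described in the comment above — hypothetical helper functions, not the repository's `ControlMeasureClassification` prompt or its `RegexPatternMatcher` — a control measure that acts before the hazard event is a prevention, one that reduces the harm caused is a mitigation, and the model's free-text answer is reduced to one of four labels:

```python
def build_classification_prompt(control_measure: str, hazard_event: str, harm_caused: str) -> str:
    """Assemble a classification prompt from the inferred event and harm (illustrative only)."""
    return (
        f"Event that leads to harm: {hazard_event}\n"
        f"Harm caused: {harm_caused}\n"
        f"Control measure: {control_measure}\n"
        "Answer with exactly one of: prevention, mitigation, both, neither."
    )

def match_classification(llm_output: str) -> str:
    """Reduce free-text model output to one of the four labels."""
    text = llm_output.lower()
    if "both" in text or ("prevention" in text and "mitigation" in text):
        return "both"
    if "prevention" in text:
        return "prevention"
    if "mitigation" in text:
        return "mitigation"
    return "neither"
```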
2 changes: 2 additions & 0 deletions app/prompts/HarmCausedAndHazardEvent.py
@@ -1,3 +1,5 @@
# PromptInput class used to infer the "event that leads to harm" and the "harm caused" from the student's risk assessment inputs.

from .BasePromptInput import BasePromptInput

class HarmCausedAndHazardEvent(BasePromptInput):
2 changes: 2 additions & 0 deletions app/prompts/HowItHarmsInContext.py
@@ -1,3 +1,5 @@
# PromptInput class that checks whether the "How it harms" input matches the "activity" and "hazard" inputs.

from ..prompts.BasePromptInput import BasePromptInput
from ..utils.RegexPatternMatcher import RegexPatternMatcher

2 changes: 2 additions & 0 deletions app/prompts/NoInformationProvided.py
@@ -1,3 +1,5 @@
# PromptInput class that checks whether no information is provided in the "prevention" or "mitigation" input fields.

from .BasePromptInput import BasePromptInput

class NoInformationProvided(BasePromptInput):
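The deleted `evaluation.py` branch above shows how this prompt's output is consumed: when the matched pattern is "no information provided" (or the field is empty), feedback for that field is skipped. A condensed, illustrative restatement of that check — the function name is hypothetical — is:

```python
def fields_without_information(prevention: str, mitigation: str, patterns: dict) -> list:
    """Collect input fields for which the prompt matched 'no information provided'
    (or the field itself is empty), so feedback for them can be skipped."""
    skipped = []
    if patterns.get("prevention") == "no information provided" or prevention == "":
        skipped.append("Prevention")
    if patterns.get("mitigation") == "no information provided" or mitigation == "":
        skipped.append("Mitigation")
    return skipped
```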
2 changes: 2 additions & 0 deletions app/prompts/SummarizeControlMeasureFeedback.py
@@ -1,3 +1,5 @@
# PromptInput class that takes in the output from the ControlMeasureClassification prompt and shortens it to 3 sentences.

from ..prompts.BasePromptInput import BasePromptInput

class SummarizeControlMeasureFeedback(BasePromptInput):
2 changes: 2 additions & 0 deletions app/prompts/WhoItHarmsInContext.py
@@ -1,3 +1,5 @@
# PromptInput class that checks whether the "Who it harms" input matches the "activity", "hazard" and "how it harms" inputs.

from ..prompts.BasePromptInput import BasePromptInput
from ..utils.RegexPatternMatcher import RegexPatternMatcher

2 changes: 2 additions & 0 deletions app/test_classes/BaseTestClass.py
@@ -1,3 +1,5 @@
# Base class used to test the accuracy of different prompts

from ..utils.LLMCaller import LLMCaller
from ..utils.RegexPatternMatcher import RegexPatternMatcher

2 changes: 2 additions & 0 deletions app/test_classes/TestBothPreventionAndMitigationInput.py
@@ -1,3 +1,5 @@
# Builds on TestControlMeasureClassificationPrompt.py to test % of times both prevention and mitigation correctly classified.

from ..test_classes.TestControlMeasureClassificationPrompt import TestControlMeasureClassificationPrompt
from ..utils.LLMCaller import LLMCaller
import numpy as np
2 changes: 2 additions & 0 deletions app/test_classes/TestControlMeasureClassificationPrompt.py
@@ -1,3 +1,5 @@
# Builds on TestModelAccuracyForCombinationOfPrompts.py to test accuracy of control measure classification.

from ..test_classes.TestModelAccuracyForCombinationOfPrompts import TestModelAccuracyForCombinationOfPrompts
from ..utils.LLMCaller import LLMCaller

2 changes: 2 additions & 0 deletions app/test_classes/TestModelAccuracy.py
@@ -1,3 +1,5 @@
# Builds on BaseTestClass to allow testing of multiple risk assessment examples (from data/example_risk_assessments.py)

from ..test_classes.BaseTestClass import BaseTestClass
from ..utils.LLMCaller import LLMCaller
import pandas as pd
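The accuracy measurement these test classes describe can be sketched as follows — the `Example` container and `measure_accuracy` names are assumptions for illustration, not the repository's `TestModelAccuracy` implementation: run every example through the prompt-and-match pipeline and report the fraction whose matched pattern equals the expected label.

```python
from dataclasses import dataclass

@dataclass
class Example:
    control_measure: str
    hazard_event: str
    harm_caused: str
    expected: str  # e.g. "prevention" or "mitigation"

def measure_accuracy(examples, classify):
    """Fraction of examples whose matched pattern equals the expected label."""
    if not examples:
        return 0.0
    correct = sum(
        classify(ex.control_measure, ex.hazard_event, ex.harm_caused) == ex.expected
        for ex in examples
    )
    return correct / len(examples)
```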
2 changes: 2 additions & 0 deletions app/test_classes/TestModelAccuracyForCombinationOfPrompts.py
@@ -1,3 +1,5 @@
# Builds on TestModelAccuracy class to test the accuracy of multiple prompts used in sequence, e.g. the HarmCausedAndHazardEvent and ControlMeasureClassification prompts.

from ..test_classes.TestModelAccuracy import TestModelAccuracy
from ..utils.LLMCaller import LLMCaller

2 changes: 2 additions & 0 deletions app/test_classes/TestPromptOnSingleExample.py
@@ -1,3 +1,5 @@
# Builds on BaseTestClass.py to test prompt on single risk assessment example (this is used in unit tests)

from BaseTestClass import BaseTestClass
from ..utils.LLMCaller import LLMCaller

3 changes: 2 additions & 1 deletion app/test_scripts/risk_domain_test_for_how_it_harms_prompt.py
@@ -1,4 +1,5 @@
# python -m app.test_scripts.risk_domain_test_for_how_it_harms_prompt
# Script that tests whether the "how it harms" input is from the same risk domain as the "activity" and "hazard" inputs.
# To run, enter in terminal: python -m app.test_scripts.risk_domain_test_for_how_it_harms_prompt

from ..test_classes.TestModelAccuracy import TestModelAccuracy
from ..test_utils.ExamplesGenerator import ExamplesGeneratorFromCorrectExamples
4 changes: 3 additions & 1 deletion app/test_scripts/risk_domain_test_for_who_it_harms_prompt.py
@@ -1,4 +1,6 @@
# python -m app.test_scripts.risk_domain_test_for_who_it_harms_prompt
# Script that tests whether the "who it harms" input is from the same risk domain as the "activity" and "hazard" inputs.

# To run, enter in terminal: python -m app.test_scripts.risk_domain_test_for_who_it_harms_prompt

from ..test_classes.TestModelAccuracy import TestModelAccuracy
from ..test_utils.ExamplesGenerator import ExamplesGeneratorFromCorrectExamples
app/test_scripts/test_control_measure_classification_prompts.py
@@ -1,4 +1,6 @@
# python -m app.test_scripts.test_control_measure_classification_prompts
# To run, enter in terminal: python -m app.test_scripts.test_control_measure_classification_prompts

# Script that tests the accuracy of control measure classification prompt and performs ablation study to see impact of few-shot and chain-of-thought prompting on accuracy.

from ..test_classes.TestBothPreventionAndMitigationInput import TestBothPreventionAndMitigationInput
from ..test_classes.TestPreventionInput__ControlMeasureClassifiationPrompt import TestPreventionInput__ControlMeasureClassifiationPrompt
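The ablation study mentioned in the comment above could, in spirit, look like the sketch below — the `classify_with_options` callable and its `few_shot`/`chain_of_thought` flags are assumptions for illustration, not the script's actual interface. Each prompting variant is evaluated on the same examples so their accuracies can be compared directly.

```python
from functools import partial
from itertools import product

def run_ablation(examples, classify_with_options, measure_accuracy):
    """Compare accuracy with few-shot examples and chain-of-thought toggled independently."""
    results = {}
    for few_shot, chain_of_thought in product([False, True], repeat=2):
        classify = partial(classify_with_options,
                           few_shot=few_shot,
                           chain_of_thought=chain_of_thought)
        results[(few_shot, chain_of_thought)] = measure_accuracy(examples, classify)
    return results  # maps (few_shot, chain_of_thought) -> accuracy
```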

This file was deleted.

4 changes: 3 additions & 1 deletion app/test_scripts/test_latency.py
@@ -1,4 +1,6 @@
# python -m app.test_scripts.test_latency
# Script which tests the latency of different LLMs

# To run, enter in terminal: python -m app.test_scripts.test_latency

from ..utils.LLMCaller import *
from ..evaluation import evaluation_function, Params
4 changes: 3 additions & 1 deletion app/test_scripts/test_no_information_provided.py
@@ -1,4 +1,6 @@
# python -m app.test_scripts.test_no_information_provided
# Script which tests the accuracy of the NoInformationProvided.py prompt

# To run, enter in terminal: python -m app.test_scripts.test_no_information_provided

from ..test_utils.InputAndExpectedOutput import InputAndExpectedOutputForSinglePrompt
from ..utils.LLMCaller import *
app/test_scripts/test_summarize_control_measure_feedback_prompt.py
@@ -1,4 +1,4 @@
# python -m app.test_scripts.test_summarize_control_measure_feedback_prompt
# To run, enter in terminal: python -m app.test_scripts.test_summarize_control_measure_feedback_prompt

from ..utils.LLMCaller import *
from ..test_classes.TestSummarizeControlMeasureFeedback import TestSummarizePreventionFeedback, TestSummarizeMitigationFeedback
2 changes: 2 additions & 0 deletions app/test_utils/ExamplesGenerator.py
@@ -1,3 +1,5 @@
# Series of classes used to create InputAndExpectedOutput objects for different tests.

from .InputAndExpectedOutput import InputAndExpectedOutputForSinglePrompt, InputAndExpectedOutputForCombinedPrompts

class ExamplesGenerator:
2 changes: 2 additions & 0 deletions app/test_utils/InputAndExpectedOutput.py
@@ -1,3 +1,5 @@
# Class which contains a prompt input object and the expected output for the prompt.

try:
from ..prompts.BasePromptInput import BasePromptInput
from ..data.RiskAssessment import RiskAssessment