
expect json response from the LLM #53


Merged · 22 commits · Feb 26, 2024
Changes from all commits (22 commits)
2680502
try parse_json_response before parse_non_json_response
davidsbailey Feb 13, 2024
d46f7a4
add response-type param, and use it to select example rubric filename…
davidsbailey Feb 14, 2024
47fc9a7
use response_type to choose method for parsing AI response
davidsbailey Feb 14, 2024
a14416c
fix get_params processing
davidsbailey Feb 14, 2024
71355a6
look for last ] when parsing json
davidsbailey Feb 14, 2024
c753c2e
fix unit tests
davidsbailey Feb 14, 2024
52946a8
add json example response
davidsbailey Feb 15, 2024
35400aa
fix non-json scenarios
davidsbailey Feb 16, 2024
0a07f6f
add readme notes on creating experiments and generating example LLM r…
davidsbailey Feb 16, 2024
3a8779a
add note to readme about using cached responses
davidsbailey Feb 16, 2024
aff5ad5
add more notes about running rubric tester cheaply
davidsbailey Feb 16, 2024
38bbbc9
try harder to fix missing response-type param
davidsbailey Feb 16, 2024
152d0be
test get_examples can read json
davidsbailey Feb 16, 2024
bed8bb2
fix test class names
davidsbailey Feb 16, 2024
cca202a
refactor openai_gpt_response fixture
davidsbailey Feb 16, 2024
b9af39c
add support for json output type in test fixture
davidsbailey Feb 17, 2024
3832641
extract delimiter logic into gen_tabular_response
davidsbailey Feb 17, 2024
e7fc4ec
test json response type for get_response_data_if_valid
davidsbailey Feb 17, 2024
d0f25a6
remove stale comment
davidsbailey Feb 17, 2024
008cd82
Merge branch 'main' into handle-json-response
davidsbailey Feb 26, 2024
8803886
remove stray prints from rubric tests
davidsbailey Feb 26, 2024
23710dd
replace missing EOF newline
davidsbailey Feb 26, 2024
43 changes: 43 additions & 0 deletions README.md
@@ -157,6 +157,49 @@ In order to rerun only the failed student projects, you can pass the `-c` (`--us

After enough reruns, you'll have a complete accuracy measurement for the lesson. NOTE: the very high number of errors in this example is because we are using a weak model (GPT 3.5 Turbo) by default. Stronger models often complete an entire lesson without errors, but in case of errors the same principle applies to getting complete test results.
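
For example, a rerun that picks up cached results for already-completed projects might look like this (the experiment name is the one used later in this README; substitute your own):

```commandline
python ./lib/assessment/rubric_tester.py --experiment-name ai-rubrics-pilot-baseline-gpt-4-turbo -c
```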

### Running rubric tester cheaply

#### Using cached responses

Experiments run against GPT 4, GPT 4 Turbo, and other pricey models should include report HTML and cached response data. This lets you quickly view reports for these datasets, either by looking directly at the `output/report*html` files or by regenerating the report against cached data with a command like:
```commandline
python ./lib/assessment/rubric_tester.py --experiment-name ai-rubrics-pilot-baseline-gpt-4-turbo --use-cached
```

#### Smaller test runs

By default, rubric tester runs against ~20 projects in each of 6 lessons.

While you are experimenting with expensive models, the easiest ways to limit the size and cost of your test runs are (see the combined example below):
* use the `--lesson-names` flag to run only one lesson
* use the `-s` param, e.g. `-s 3`, to run against only 3 code samples in each lesson
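
For example, a cheap run might combine both options (the lesson name here is purely illustrative; use one from your dataset):

```commandline
python ./lib/assessment/rubric_tester.py --experiment-name ai-rubrics-pilot-baseline-gpt-4-turbo --lesson-names U3-2023-L17 -s 3
```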

### Creating a new experiment

Generally speaking, a new experiment is created as follows:
* download an existing experiment from S3
* create a local copy
* make local changes and measure accuracy
* once satisfied, upload the experiment dir back to S3, including the `output/` and `cached_responses/` directories

A similar process can be followed for new releases; a rough sketch of the S3 round trip is shown below.
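
A minimal sketch of the workflow, assuming the `aws` CLI is configured; the bucket name and local `experiments/` layout here are assumptions, so adjust the paths to match the real bucket:

```commandline
# download an existing experiment (bucket path is illustrative)
aws s3 sync s3://<your-ai-rubrics-bucket>/experiments/ai-rubrics-pilot-baseline-gpt-4-turbo experiments/ai-rubrics-pilot-baseline-gpt-4-turbo

# create a local copy to modify
cp -r experiments/ai-rubrics-pilot-baseline-gpt-4-turbo experiments/my-new-experiment

# ...make local changes, run rubric tester, measure accuracy...

# once satisfied, upload the new experiment, including output/ and cached_responses/
aws s3 sync experiments/my-new-experiment s3://<your-ai-rubrics-bucket>/experiments/my-new-experiment
```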

### Regenerating example LLM responses

Rubric tester supports sending example user requests (js) and LLM responses (json) so that the LLM can use few-shot learning to give better results. Once you have identified the js you want to use as examples, here is how you can leverage rubric tester to have the LLM do most of the work of generating these responses on your local machine:

1. create a new experiment you want to add examples to
2. craft example js and desired labels into a temporary new dataset
   * copy your example `*.js` files into the dataset
   * create an `actual_labels.csv` containing desired labels for each student and each learning goal
3. use the LLM to generate new json responses as a starting point
   * temporarily modify `label.py` to log each `response_data` it receives (see the sketch after this list)
   * use rubric tester to run the new experiment against the temp dataset using GPT 4 classic (e.g. `gpt-4-0613`), and record the report html and log output
4. in your experiment, create the example responses
   * copy your example js from the temp dataset into `examples/*.js`
   * clean the log output and paste it into `examples/*.json`
   * use the report html to identify any errors the LLM made, and correct any issues in the Observations, Reason, or Grade fields in the new `examples/*.json`
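
A minimal sketch of the temporary logging change in step 3; the exact location inside `label.py` is up to you, as long as it runs after `response_data` has been parsed (shown here as a standalone snippet, not the project's actual code):

```python
# temporary: dump each parsed response_data as JSON so the log output
# can be cleaned up and pasted into examples/*.json
import json
import logging

logging.info(json.dumps(response_data, indent=4))
```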

## Logging

Logging is done via the normal Python `logging` library.
37 changes: 29 additions & 8 deletions lib/assessment/label.py
@@ -47,7 +47,7 @@ def statically_label_student_work(self, rubric, student_code, student_id, exampl
# We can't assess this statically
return None

def ai_label_student_work(self, prompt, rubric, student_code, student_id, examples=[], num_responses=0, temperature=0.0, llm_model=""):
def ai_label_student_work(self, prompt, rubric, student_code, student_id, examples=[], num_responses=0, temperature=0.0, llm_model="", response_type='tsv'):
# Determine the OpenAI URL and headers
api_url = 'https://api.openai.com/v1/chat/completions'
headers = {
@@ -74,7 +74,7 @@ def ai_label_student_work(self, prompt, rubric, student_code, student_id, exampl

info = response.json()

response_data = self.response_data_from_choices(info, rubric, student_id)
response_data = self.response_data_from_choices(info, rubric, student_id, response_type=response_type)

return {
'metadata': {
@@ -85,7 +85,7 @@ def ai_label_student_work(self, prompt, rubric, student_code, student_id, exampl
'data': response_data,
}

def label_student_work(self, prompt, rubric, student_code, student_id, examples=[], use_cached=False, write_cached=False, num_responses=0, temperature=0.0, llm_model="", remove_comments=False, cache_prefix=""):
def label_student_work(self, prompt, rubric, student_code, student_id, examples=[], use_cached=False, write_cached=False, num_responses=0, temperature=0.0, llm_model="", remove_comments=False, response_type='tsv', cache_prefix=""):
if use_cached and os.path.exists(os.path.join(cache_prefix, f"cached_responses/{student_id}.json")):
with open(os.path.join(cache_prefix, f"cached_responses/{student_id}.json"), 'r') as f:
return json.load(f)
@@ -104,7 +104,7 @@ def label_student_work(self, prompt, rubric, student_code, student_id, examples=
# Right now, however, only if there is no result, we try the AI for assessment
if result is None:
try:
result = self.ai_label_student_work(prompt, rubric, student_code, student_id, examples=examples, num_responses=num_responses, temperature=temperature, llm_model=llm_model)
result = self.ai_label_student_work(prompt, rubric, student_code, student_id, examples=examples, num_responses=num_responses, temperature=temperature, llm_model=llm_model, response_type=response_type)
except requests.exceptions.ReadTimeout:
logging.error(f"{student_id} request timed out in {(time.time() - start_time):.0f} seconds.")
result = None
@@ -158,15 +158,15 @@ def sanitize_code(self, student_code, remove_comments=False):

return student_code

def response_data_from_choices(self, info, rubric, student_id):
def response_data_from_choices(self, info, rubric, student_id, response_type='tsv'):
max_index = len(info['choices']) - 1
response_data_choices = []
for index, choice in enumerate(info['choices']):
# If all choices result in an InvalidResponseError, reraise the last one.
reraise = len(response_data_choices) == 0 and index == max_index

if choice['message']['content']:
response_data = self.get_response_data_if_valid(choice['message']['content'], rubric, student_id, choice_index=index, reraise=reraise)
response_data = self.get_response_data_if_valid(choice['message']['content'], rubric, student_id, choice_index=index, reraise=reraise, response_type=response_type)
if response_data:
response_data_choices.append(response_data)

@@ -188,14 +188,19 @@ def compute_messages(self, prompt, rubric, student_code, examples=[]):
messages.append({'role': 'user', 'content': student_code})
return messages

def get_response_data_if_valid(self, response_text, rubric, student_id, choice_index=None, reraise=False):
def get_response_data_if_valid(self, response_text, rubric, student_id, choice_index=None, reraise=False, response_type='tsv'):
choice_text = f"Choice {choice_index}: " if choice_index is not None else ''
if not response_text:
logging.error(f"{student_id} {choice_text} Invalid response: empty response")
return None
text = response_text.strip()

response_data = self.parse_non_json_response(text)
if response_type == 'json':
response_data = self.parse_json_response(text, student_id)
elif response_type == 'tsv':
response_data = self.parse_non_json_response(text)
else:
raise ValueError(f"Invalid response type: {response_type}")

try:
self._sanitize_server_response(response_data)
@@ -207,6 +212,22 @@ def get_response_data_if_valid(self, response_text, rubric, student_id, choice_i
raise e
return None

def parse_json_response(self, response_text, student_id):
# capture all data from the first '[' to the last ']', inclusive
match = re.search(r'(\[.*\])', response_text,re.DOTALL)
if not match:
logging.error(f"{student_id} Invalid response: no valid JSON data:\n{response_text}")
return None
json_text = match.group(1)

try:
data = json.loads(json_text)
except json.JSONDecodeError as e:
logging.error(f"{student_id} JSON decoding error: {e}\n{json_text}")
return None

return data

# parse response data in tsv, csv or markdown format.
def parse_non_json_response(self, text):
# Remove anything up to the first column name
21 changes: 13 additions & 8 deletions lib/assessment/rubric_tester.py
@@ -114,7 +114,7 @@ def get_params(prefix):
params = json.load(f)
validate_params(params)
for k in params.keys():
if k == 'model':
if k in ['model', 'response-type']:
continue
elif k == 'temperature':
params[k] = float(params[k])
@@ -125,7 +125,7 @@ def get_params(prefix):

def validate_params(params):
required_keys = ['model', 'num-responses', 'temperature']
allowed_keys = ['model', 'num-responses', 'temperature', 'remove-comments', 'num-passing-grades']
allowed_keys = ['model', 'num-responses', 'temperature', 'remove-comments', 'num-passing-grades', 'response-type']
deprecated_keys = ['num-passing-grades']
for k in required_keys:
if k not in params:
@@ -137,6 +137,8 @@ def validate_params(params):
logging.info(f"Deprecated key {k} in params.json. Please remove as this key has no effect.")
if params['model'] not in SUPPORTED_MODELS:
raise Exception(f"Unsupported LLM model: {params['model']}. Supported models are: {', '.join(SUPPORTED_MODELS)}")
if params.get('response-type', 'tsv') not in ['json', 'tsv']:
raise Exception(f"Unsupported response type: {params['response-type']}. Supported response types are: json, tsv")

def get_student_files(max_num_students, prefix, student_ids=None):
if student_ids:
@@ -162,14 +164,14 @@ def get_accuracy_thresholds(accuracy_threshold_file=accuracy_threshold_file, pre
return thresholds


def get_examples(prefix):
def get_examples(prefix, response_type):
example_js_files = sorted(glob.glob(os.path.join(prefix, 'examples', '*.js')))
examples = []
for example_js_file in example_js_files:
example_id = os.path.splitext(os.path.basename(example_js_file))[0]
with open(example_js_file, 'r') as f:
example_code = f.read()
with open(os.path.join(prefix, 'examples', f"{example_id}.tsv"), 'r') as f:
with open(os.path.join(prefix, 'examples', f"{example_id}.{response_type}"), 'r') as f:
example_rubric = f.read()
examples.append((example_code, example_rubric))
return examples
@@ -243,7 +245,7 @@ def compute_accuracy(actual_labels, predicted_labels, passing_labels):
return accuracy_by_criteria, overall_accuracy, confusion_by_criteria, overall_confusion, label_names


def read_and_label_student_work(prompt, rubric, student_file, examples, options, params, prefix):
def read_and_label_student_work(prompt, rubric, student_file, examples, options, params, prefix, response_type):
student_id = os.path.splitext(os.path.basename(student_file))[0]
with open(student_file, 'r') as f:
student_code = f.read()
@@ -261,6 +263,7 @@ def read_and_label_student_work(prompt, rubric, student_file, examples, options,
temperature=options.temperature or params['temperature'],
llm_model=options.llm_model or params['model'],
remove_comments=options.remove_comments or params.get('remove-comments', False),
response_type=response_type,
cache_prefix=prefix
)
except InvalidResponseError as e:
@@ -315,13 +318,14 @@ def main():

# read in lesson files, validate them
params = get_params(experiment_lesson_prefix)
response_type = params.get('response-type', 'tsv')
prompt, standard_rubric = read_inputs(prompt_file, standard_rubric_file, experiment_lesson_prefix)
student_files = get_student_files(options.max_num_students, dataset_lesson_prefix, student_ids=options.student_ids)
if os.path.exists(os.path.join(dataset_lesson_prefix, actual_labels_file_old)):
actual_labels = get_actual_labels(actual_labels_file_old, dataset_lesson_prefix)
else:
actual_labels = get_actual_labels(actual_labels_file, dataset_lesson_prefix)
examples = get_examples(experiment_lesson_prefix)
examples = get_examples(experiment_lesson_prefix, response_type)

validate_rubrics(actual_labels, standard_rubric)
validate_students(student_files, actual_labels)
Expand All @@ -337,7 +341,7 @@ def main():

# call label function to either call openAI or read from cache
with concurrent.futures.ThreadPoolExecutor(max_workers=7) as executor:
predicted_labels = list(executor.map(lambda student_file: read_and_label_student_work(prompt, rubric, student_file, examples, options, params, experiment_lesson_prefix), student_files))
predicted_labels = list(executor.map(lambda student_file: read_and_label_student_work(prompt, rubric, student_file, examples, options, params, experiment_lesson_prefix, response_type), student_files))

errors = [student_id for student_id, labels in predicted_labels if not labels]
# predicted_labels contains metadata and data (labels), we care about the data key
@@ -355,7 +359,8 @@ def main():
"model": options.llm_model or params['model'],
"num_responses": options.num_responses or params['num-responses'],
"temperature": options.temperature or params['temperature'],
"remove_comments": options.remove_comments or params.get('remove-comments', False)
"remove_comments": options.remove_comments or params.get('remove-comments', False),
"response_type": response_type,
}
}
report = Report()
20 changes: 20 additions & 0 deletions tests/data/example.json
@@ -0,0 +1,20 @@
[
{
"Key Concept": "Modularity - Sprites and Sprite Properties",
"Observations": "(1) The sprites created are: backgroundSprite, snowman, santa. (2) The sprite properties updated inside the draw loop are: backgroundSprite.animation, santa.animation, santa.x, snowman.rotation. (3) All sprites have a property updated in the draw loop.",
"Grade": "Extensive Evidence",
"Reason": "At least 2 sprites are created, each with at least one property updating in the draw loop."
},
{
"Key Concept": "Position and Movement",
"Observations": "(1) The sprites placed on the screen are: backgroundSprite, snowman, santa. (2) No shapes are placed on the screen. (3) No text is placed on the screen. (4) The lines of code inside of the draw loop that update the position of sprites are: santa.x = santa.x + randomNumber(-1, 1), snowman.rotation++. (5) The sprite that uses random movement is santa. (6) The sprite that uses the counter pattern is snowman.",
"Grade": "Limited Evidence",
"Reason": "2 sprites are placed on the screen using the coordinate system, but no shapes or lines of text are created. At least 1 element moves during the program."
},
{
"Key Concept": "Optional \u201cStretch\u201d Feature - Variables",
"Observations": "(1) The variables created are: backgroundSprite, snowman, santa. The values stored in these variables are sprites. (2) No variables are created that are not used to store a sprite or a sprite property.",
"Grade": "No Evidence",
"Reason": "No non-sprite variables are created."
}
]
61 changes: 43 additions & 18 deletions tests/unit/assessment/conftest.py
@@ -170,6 +170,13 @@ def remove_comments():
yield random.randint(1, 2) == 1


@pytest.fixture
def response_type():
""" Creates a valid response_type value.
"""

yield 'json'

@pytest.fixture
def student_id():
""" Returns a reasonable student user id.
@@ -197,20 +204,6 @@ def gen_gpt_response(rubric=None, num_responses=3, disagreements=0, output_type=
parsed_rubric = list(csv.DictReader(rubric.splitlines()))
key_concepts = set(x['Key Concept'] for x in parsed_rubric)

def gen_rubric_response_header(delimiter='\t'):
return f"Key Concept{delimiter}Observations{delimiter}Grade{delimiter}Reason\n"

def gen_rubric_response_row(key_concept, label, delimiter='\t'):
return f"{key_concept}{delimiter}{randomstring(10)}{delimiter}{label}{delimiter}{randomstring(10)}\n"

delimiter = '\t'

if output_type == 'markdown':
delimiter = ' | '

if output_type == 'csv':
delimiter = ','

assigned_labels = {}
for key_concept in key_concepts:
assigned_labels[key_concept] = random.choice([
@@ -222,8 +215,7 @@ def gen_rubric_response_row(key_concept, label, delimiter='\t'):

disagreements_left = disagreements
for i in range(0, num_responses):
content = gen_rubric_response_header(delimiter)

choice_data = []
for key_concept in key_concepts:
label = assigned_labels[key_concept]

@@ -237,7 +229,12 @@ def gen_rubric_response_row(key_concept, label, delimiter='\t'):
]) - set([label])))
disagreements_left -= 1

content += gen_rubric_response_row(key_concept, label, delimiter)
choice_data.append(gen_rubric_row_data(key_concept, label))

if output_type == 'json':
content = json.dumps(choice_data, indent=4)
else:
content = gen_tabular_response(choice_data, output_type)

gpt_response['choices'].append({
'index': i,
@@ -249,5 +246,33 @@ def gen_rubric_response_row(key_concept, label, delimiter='\t'):
})

return gpt_response


def gen_rubric_row_data(key_concept, label):
return {
'Key Concept': key_concept,
'Observations': randomstring(10),
'Grade': label,
'Reason': randomstring(10)
}

def gen_tabular_response(choice_data, output_type):
delimiter = '\t'

if output_type == 'markdown':
delimiter = ' | '

if output_type == 'csv':
delimiter = ','

content = gen_tabular_response_header(delimiter)
for row_data in choice_data:
content += gen_tabular_response_row(row_data, delimiter)
return content

def gen_tabular_response_header(delimiter='\t'):
return f"Key Concept{delimiter}Observations{delimiter}Grade{delimiter}Reason\n"

def gen_tabular_response_row(data, delimiter='\t'):
return f"{data['Key Concept']}{delimiter}{data['Observations']}{delimiter}{data['Grade']}{delimiter}{data['Reason']}\n"

return gen_gpt_response