Skip to content

Commit 9f96e25

Browse files
committed
Fix the document alignment problem: the reorder had broken the connection between doc and student, and we were pulling in errors from inactive students. This commit fixes both issues.
1 parent 2818dae commit 9f96e25

File tree

1 file changed

+60
-9
lines changed

1 file changed

+60
-9
lines changed

modules/writing_observer/writing_observer/aggregator.py

Lines changed: 60 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
import time
2+
import traceback
23
import learning_observer.util
34

45
def excerpt_active_text(
@@ -125,7 +126,16 @@ def aggregate_course_summary_stats(student_data):
125126
async def get_latest_student_documents(student_data):
126127
'''
127128
This will retrieve the latest student documents from the database. It breaks
128-
abstractions.
129+
abstractions.
130+
131+
It also involves some excess loops that are annoying but briefly we need to
132+
determine which students actually *have* last writing data. Then we need to
133+
go through and build keys for that data. Then we fetch the data itself.
134+
Later on in this file we need to marry the information again. This builds
135+
up a series of lists which are successively merged into a single dict with
136+
the resulting data.
137+
138+
Some of what is copied along is clearly duplicative and probably unneeded.
129139
'''
130140
import learning_observer.kvs
131141

@@ -134,6 +144,11 @@ async def get_latest_student_documents(student_data):
134144

135145
kvs = learning_observer.kvs.KVS()
136146

147+
# Compile a list of the active students defined as those
148+
# for which we have some "last_document"
149+
active_students = [s for s in student_data if 'writing_observer.writing_analysis.last_document' in s]
150+
151+
# Now collect documents for all of the active students.
137152
document_keys = ([
138153
learning_observer.stream_analytics.helpers.make_key(
139154
writing_observer.writing_analysis.reconstruct,
@@ -142,13 +157,29 @@ async def get_latest_student_documents(student_data):
142157
EventField('doc_id'): s['writing_observer.writing_analysis.last_document']['document_id']
143158
},
144159
KeyStateType.INTERNAL
145-
) for s in student_data])
146-
147-
writing_data = await kvs.multiget(keys=document_keys)
160+
) for s in active_students]) # in student_data if 'writing_observer.writing_analysis.last_document' in s])
161+
162+
kvs_data = await kvs.multiget(keys=document_keys)
148163

164+
149165
# Return blank entries if no data, rather than None. This makes it possible
150-
# to use item.get with defaults sanely.
151-
writing_data = [{} if item is None else item for item in writing_data]
166+
# to use item.get with defaults sanely. For the sake of later alignment
167+
# we also zip up the items with the keys and users that they come from
168+
# this hack allows us to align them after cleaning occurs later.
169+
# writing_data = [{} if item is None else item for item in writing_data]
170+
writing_data = []
171+
for idx in range(len(document_keys)):
172+
student = active_students[idx]
173+
doc = kvs_data[idx]
174+
175+
# If we have an empty item we simply return an empty dict with the
176+
# student but an empty doc value.
177+
if (doc is None): doc = {}
178+
179+
# Now insert the student data and pass it along.
180+
doc['student'] = student
181+
writing_data.append(doc)
182+
152183
return writing_data
153184

154185

@@ -167,6 +198,7 @@ async def merge_with_student_data(writing_data, student_data):
167198
'''
168199
Add the student metadata to each text
169200
'''
201+
170202
for item, student in zip(writing_data, student_data):
171203
if 'edit_metadata' in item:
172204
del item['edit_metadata']
@@ -179,16 +211,35 @@ async def merge_with_student_data(writing_data, student_data):
179211
async def latest_data(student_data):
180212
'''
181213
HACK HACK HACK
182-
183-
I just hardcoded this, breaking abstractions, repeating code, etc.
214+
215+
This code needs to take the student data as a dict and then
216+
collect the latest writing data for each student (assuming
217+
they have it). The code then passes that writing data on
218+
to Paul's code for processing. For the time being this
219+
works by essentially building up some large dicts that
220+
contain the text and student data together.
221+
222+
In the long run this should *all* be replaced by a cleaner
223+
object interface that hides some of this from the user
224+
but for now we'll roll with this.
184225
'''
226+
# Get the latest documents with the students appended.
185227
writing_data = await get_latest_student_documents(student_data)
228+
229+
# Strip out the unnecessary extra data.
186230
writing_data = await remove_extra_data(writing_data)
187-
writing_data = await merge_with_student_data(writing_data, student_data)
231+
232+
# The merge process is not needed at the moment because we are making
233+
# a simple alignment.
234+
#writing_data = await merge_with_student_data(writing_data, student_data)
235+
188236
just_the_text = [w.get("text", "") for w in writing_data]
237+
189238
annotated_texts = await writing_observer.awe_nlp.process_texts_parallel(just_the_text)
239+
190240
for annotated_text, single_doc in zip(annotated_texts, writing_data):
191241
if annotated_text != "Error":
192242
single_doc.update(annotated_text)
193243
# Call Paul's code to add stuff to it
244+
194245
return {'latest_writing_data': writing_data}

0 commit comments

Comments
 (0)