Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
114 changes: 51 additions & 63 deletions executor/executor/executor.py
Original file line number Diff line number Diff line change
Expand Up @@ -340,6 +340,8 @@ def earthmover_run(self):
self.logger.info(f"Student ID types in Ed-Fi roster: {os.environ['EDFI_STUDENT_ID_TYPES']}")

self.check_input_encoding()
fatal = False

try:
encoding_mod = []
if self.input_sources["INPUT_FILE"]["is_plausible_non_utf8"]:
Expand Down Expand Up @@ -385,27 +387,23 @@ def earthmover_run(self):
except subprocess.CalledProcessError:
# failed again, move on to shutdown procedure
pass
finally:
# the app relies on the presence of the unmatched students file but does not check
# whether it is populated. It is possible that Earthmover successfully matches students
# (so the file is empty) but then fails during data transformation. We need to check the
# match rates whether or not Earthmover succeeds, so that we don't accidentally tell the
# user there are unmatched students when there are none
self.record_highest_match_rate()
if fatal:
# shut it down
if count_unmatched_students() == 0:
# the app relies on the presence of the unmatched students file but does not check
# whether it is populated. It is possible that Earthmover successfully matches students
# (so the file is empty) but then fails during data transformation. By skipping the
# upload, we avoid the possibility of telling the user there were unmatched students when
# in fact there were none
self.logger.debug("no unmatched students. Skipping upload")
artifact.UNMATCHED_STUDENTS.needs_upload = False
self.error = error.EarthmoverRunError()
raise

self.upload_artifact(artifact.EM_RESULTS)
self.upload_artifact(artifact.MATCH_RATES)

# If we reach this point, it's pretty likely that the input file was basically compatible with
# the bundle. However, we still need to check whether student ID matching was at least partially
# successful. It used to be that the student_ids bundle would halt execution when there weren't
# enough matches, but those days are over; we handle it ourselves.
self.handle_unmatched_students()
# If we reach this point, it's likely that the input file was basically compatible with the bundle
self.report_unmatched_students()

def check_input_encoding(self):
"""Determine whether assessment file should be loaded with a non-UTF-8 encoding"""
Expand Down Expand Up @@ -515,65 +513,66 @@ def unpack_id_types(self):
else:
self.stu_unique_id_in_roster = None

def handle_unmatched_students(self):
"""Alert the app to the existence of unmatched students (if any) and the best candidate for ID matching"""
def record_highest_match_rate(self):
"""Read the match rates file to determine which ID was the best match and how many student IDs failed to match it"""
# Let's make one thing very clear: There is an unmatched students file and a match rates file.
# The unmatched students file is like the input file, filtered for any records that do not match the
# **primary** ID used. e.g. If 6/10 of the input records match a state ID and 2/10 others match a
# local ID, then the 4/10 without matching state IDs go in the unmatched students file, but both types
# of match are recorded in the match rates file. Both are used for different purposes by the app and
# within the exeuctor

# if there are no unmatched students, there is no unmatched students file
num_unmatched = count_unmatched_students()
if num_unmatched == 0:
self.logger.debug("no unmatched students. Skipping upload")
# mark so that we don't try to upload it at the end
artifact.UNMATCHED_STUDENTS.needs_upload = False
return

self.logger.warning('earthmover run produced unmatched inputs')

with open(artifact.MATCH_RATES.path) as f:
match_rates = [
{k: v for k, v in row.items()}
for row in csv.DictReader(f, skipinitialspace=True)
]

sufficient_matches = False
highest_match_rate = 0.0
highest_match_id_name = "N/A"
highest_match_id_type = "N/A"
# in the case of literally zero matches, the file is empty and these defaults remain
self.highest_match_rate = 0.0
self.highest_match_id_name = "N/A"
self.highest_match_id_type = "N/A"
self.num_unmatched_students = 0

# (a) if there are truly no matches, the match rates file has a header and no entries
if len(match_rates) > 0:
self.logger.info(f"some records matched - match rates by ID: {match_rates}")

highest_match = sorted(match_rates, reverse=True, key=lambda mr: float(mr['match_rate']))[0]
highest_match_rate = float(highest_match["match_rate"])
highest_match_id_name = highest_match["source_column_name"]
highest_match_id_type = highest_match["edfi_column_name"]

# (b)
if highest_match_rate >= config.REQUIRED_ID_MATCH_RATE:
# in this case, there are unmatched students but not so many that we doubt the
# integrity of the file. Send the ones that match and give the rest back to the user
sufficient_matches = True
# if an "actual" ID is replicated as studentUniqueId, we should send the actual ID to the user
if highest_match_id_type == "studentUniqueId" and self.stu_unique_id_in_roster:
highest_match_id_type = self.stu_unique_id_in_roster

# additional context so the app can help the user fix their file
self.send_id_matches(highest_match_id_name, highest_match_id_type, num_unmatched)
self.upload_artifact(artifact.UNMATCHED_STUDENTS)
if not sufficient_matches:
# If we made it here, then either (a) or (b) is FALSE and thus the input file is no good.
# Don't bother uploading anything and instead show the user a specific error
self.error = error.InsufficientMatchesError(highest_match_rate, config.REQUIRED_ID_MATCH_RATE, highest_match_id_name, highest_match_id_type)
self.highest_match_rate = float(highest_match["match_rate"])
self.highest_match_id_name = highest_match["source_column_name"]
self.highest_match_id_type = highest_match["edfi_column_name"]
self.num_unmatched_students = int(highest_match["num_rows"]) - int(highest_match["num_matches"])

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

(Feel free to take or leave this comment - it's not introduced by this change, just something I noticed while reviewing it.)

The sorting of match_rates could be done by earthmover, rather than here. The best_id_match transformation already does; id_match_rates could as well.

if self.num_unmatched_students == 0:
self.logger.debug("no unmatched students. Skipping upload of unmatched students file")
artifact.UNMATCHED_STUDENTS.needs_upload = False

def report_unmatched_students(self):
"""Alert the app to the existence of unmatched students (if any) and the best candidate for ID matching"""
if self.num_unmatched_students == 0:
return

self.logger.warning('earthmover run failed to match some student IDs')

if self.highest_match_rate >= config.REQUIRED_ID_MATCH_RATE:
# in this case, there are unmatched students but not so many that we doubt the
# integrity of the file. Send the ones that match and give the rest back to the user
# if an "actual" ID is replicated as studentUniqueId, we should send the actual ID to the user
id_type_to_report = self.highest_match_id_type
if self.highest_match_id_type == "studentUniqueId" and self.stu_unique_id_in_roster:
id_type_to_report = self.stu_unique_id_in_roster

# additional context so the app can help the user fix their file
self.send_id_matches(self.highest_match_id_name, id_type_to_report, self.num_unmatched_students)
self.upload_artifact(artifact.UNMATCHED_STUDENTS)
else:
# Insufficient matches. Assume the input file is no good Don't bother uploading anything.
# Instead, alert the user with an error
self.error = error.InsufficientMatchesError(self.highest_match_rate, config.REQUIRED_ID_MATCH_RATE, self.highest_match_id_name, self.highest_match_id_type)
# For now, since we're asking the user to revisit their entire file, it's simpler if we don't
# return the unmatched students file at all
self.logger.debug("too many unmatched students. Skipping upload")
artifact.UNMATCHED_STUDENTS.needs_upload = False
raise ValueError(f"insufficient ID matches to continue (highest rate {highest_match_rate} < required {config.REQUIRED_ID_MATCH_RATE}; ID column name: {highest_match_id_name}; Ed-Fi ID type: {highest_match_id_type})")
raise ValueError(f"insufficient ID matches to continue (highest rate {self.highest_match_rate} < required {config.REQUIRED_ID_MATCH_RATE}; ID column name: {self.highest_match_id_name}; Ed-Fi ID type: {self.highest_match_id_type})")

def lightbeam_send(self):
"""Upload Earthmover's outputs to the ODS"""
Expand Down Expand Up @@ -703,17 +702,6 @@ def send_summary(self):
self.conn.post(self.summary_url, json=self.summary)


def count_unmatched_students():
"""Returns the number of data rows in Earthmover's unmatched students file"""
try:
with open(artifact.UNMATCHED_STUDENTS.path) as f:
# header doesn't count - also, when you open this file in an IDE, it may appear to have an extra blank line,
# which would lead to false positives when there are no unmatched students. Python and programs like `wc -l`
# nonetheless appear to read it in as a single-line file, so this is correct
return max(sum(1 for _ in f) - 1, 0)
except FileNotFoundError:
return 0

def localize_s3_path(path):
"""Convert an S3 'path' to a single filename"""
return path.replace("/", "__")
2 changes: 1 addition & 1 deletion executor/requirements.txt
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
awscli
boto3
chardet
earthmover==0.4.7
earthmover==0.4.8
lightbeam==0.1.10
requests