fix(ingest): Update metadata hash calculation, do not include empty fields in hash (#2877)
anna-parker authored Sep 26, 2024
1 parent a6a0c9f commit ec5ce24
Showing 3 changed files with 14 additions and 2 deletions.
2 changes: 2 additions & 0 deletions ingest/README.md
@@ -42,6 +42,8 @@ Every sequence entry is to be uploaded only once and must be ignored by future p

To achieve this, an md5 hash is generated for each sequence entry based on the post-transform metadata and sequence content. The hash covers all metadata fields submitted to Loculus as well as the sequence. Hence, changes to the ingest pipeline's transform step (above) can lead to a changed hash and resubmission, even without any underlying data change on INSDC. Likewise, some changes to the INSDC data might not cause a sequence update on Loculus if the changed data does not affect the post-transformed metadata.
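
As a rough sketch of this scheme (field names and values here are illustrative; the real logic is in `ingest/scripts/prepare_metadata.py`, shown further down in this commit):

```python
import hashlib
import json

# Hypothetical post-transform metadata and sequence for one entry
record = {"country": "Kenya", "host": "Homo sapiens", "collection_date": "2023-05-01"}
sequence = "ACGTACGTACGT"

# Hash the sequence, then hash the sorted metadata JSON together with the
# sequence hash, so that a change to either part changes the final hash.
sequence_hash = hashlib.md5(sequence.encode(), usedforsecurity=False).hexdigest()
metadata_dump = json.dumps(record, sort_keys=True)
entry_hash = hashlib.md5((metadata_dump + sequence_hash).encode(), usedforsecurity=False).hexdigest()
```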

To allow metadata fields to be added or removed without a version bump across all samples, we only hash fields that have a value. For example, the hash of a sample where the field `is_lab_host` is empty is equal to the hash of that same sample without the `is_lab_host` field.
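
A minimal sketch of this behaviour, using a hypothetical `metadata_hash` helper that applies the same filtering as this commit:

```python
import hashlib
import json

def metadata_hash(record: dict) -> str:
    # Drop fields that are None or stringify to "", as introduced in this commit
    filtered = {k: str(v) for k, v in record.items() if v is not None and str(v)}
    return hashlib.md5(json.dumps(filtered, sort_keys=True).encode(), usedforsecurity=False).hexdigest()

# An empty "is_lab_host" field hashes the same as no "is_lab_host" field at all
assert metadata_hash({"host": "Homo sapiens", "is_lab_host": ""}) == metadata_hash({"host": "Homo sapiens"})
```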

For segmented viruses, we calculate the md5 hash of each segment and then, after grouping the segments, concatenate the per-segment hashes and hash the result.
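
Sketched for the concatenation step only (segment names and sequences are made up; the grouped-hash logic lives in `ingest/scripts/group_segments.py`):

```python
import hashlib

segments = {"L": "ACGTACGT", "M": "TTGACCAA", "S": "GGCCTTAA"}

# Hash each segment, then hash the concatenation of the per-segment hashes
# to obtain a single hash for the grouped entry.
segment_hashes = [
    hashlib.md5(seq.encode(), usedforsecurity=False).hexdigest()
    for _, seq in sorted(segments.items())
]
group_hash = hashlib.md5("".join(segment_hashes).encode(), usedforsecurity=False).hexdigest()
```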

### Grouping segmented viruses
7 changes: 6 additions & 1 deletion ingest/scripts/group_segments.py
@@ -232,8 +232,13 @@ def main(

row["submissionId"] = joint_key

# The hash should be identical whether a field is
# 1. not in keys_to_keep, or
# 2. in keys_to_keep but set to "" or None
filtered_record = {k: str(v) for k, v in row.items() if v is not None and str(v)}

row["hash"] = hashlib.md5(
json.dumps(row, sort_keys=True).encode(), usedforsecurity=False
json.dumps(filtered_record, sort_keys=True).encode(), usedforsecurity=False
).hexdigest()

metadata[joint_key] = row
7 changes: 6 additions & 1 deletion ingest/scripts/prepare_metadata.py
@@ -138,7 +138,12 @@ def main(
msg = f"No hash found for {record[config.fasta_id_field]}"
raise ValueError(msg)

metadata_dump = json.dumps(record, sort_keys=True)
# The hash should be identical whether a field is
# 1. not in keys_to_keep, or
# 2. in keys_to_keep but set to "" or None
filtered_record = {k: str(v) for k, v in record.items() if v is not None and str(v)}

metadata_dump = json.dumps(filtered_record, sort_keys=True)
prehash = metadata_dump + sequence_hash

record["hash"] = hashlib.md5(prehash.encode(), usedforsecurity=False).hexdigest()
