-
Notifications
You must be signed in to change notification settings - Fork 4
Open
Labels
Description
Problem Summary
Referential integrity testing identified 10 tables with invalid foreign key references, totaling 273,091 orphaned records. This excludes concept mappings and external system table references.
Findings
- APPOINTMENT.patient_id → PATIENT.id: 1 invalid reference
- APPOINTMENT_PRACTITIONER.appointment_id → APPOINTMENT.id: 46,730 invalid references
- ENCOUNTER.patient_id → PATIENT.id: 24 invalid references
- ENCOUNTER.appointment_id → APPOINTMENT.id: 24 invalid references
- ENCOUNTER.episode_of_care_id → EPISODE_OF_CARE.id: 12 invalid references
- LOCATION.managing_organisation_id → ORGANISATION.id: 1,170 invalid references
- MEDICATION_ORDER.referral_request_id → REFERRAL_REQUEST.id: 4 invalid references
- MEDICATION_STATEMENT.referral_request_id → REFERRAL_REQUEST.id: 227 invalid references
- PATIENT_REGISTERED_PRACTITIONER_IN_ROLE.person_id → PERSON.id: 34,610 invalid references
- SCHEDULE.location_id → LOCATION.id: 190,289 invalid references
Impact
These referential integrity violations would have a low-medium impact on analyses that rely on Appointment and Location data. Counts for other failures are low and will have a limited impact.
The person_id failure relates to known issue #17
Test Evidence
Snowpark Python Script:
import snowflake.snowpark as snowpark
def main(session: snowpark.Session):
db = "Data_Store_OLIDS_UAT"
schema = "OLIDS_MASKED"
session.sql(f'USE DATABASE "{db}"').collect()
session.sql(f'USE SCHEMA "{schema}"').collect()
# Get all available columns to validate relationships
columns_query = f"""
SELECT table_name, column_name
FROM "{db}".INFORMATION_SCHEMA.COLUMNS
WHERE table_schema = '{schema}'
"""
available_columns = session.sql(columns_query).collect()
column_lookup = {(row['TABLE_NAME'], row['COLUMN_NAME']) for row in available_columns}
# All table-to-table relationships (excluding concept mappings and external system references)
relationships = [
('ALLERGY_INTOLERANCE', 'patient_id', 'PATIENT', 'id'),
('ALLERGY_INTOLERANCE', 'person_id', 'PERSON', 'id'),
('ALLERGY_INTOLERANCE', 'encounter_id', 'ENCOUNTER', 'id'),
('ALLERGY_INTOLERANCE', 'practitioner_id', 'PRACTITIONER', 'id'),
('APPOINTMENT', 'patient_id', 'PATIENT', 'id'),
('APPOINTMENT', 'person_id', 'PERSON', 'id'),
('APPOINTMENT', 'organisation_id', 'ORGANISATION', 'id'),
('APPOINTMENT', 'practitioner_in_role_id', 'PRACTITIONER_IN_ROLE', 'id'),
('APPOINTMENT', 'schedule_id', 'SCHEDULE', 'id'),
('APPOINTMENT_PRACTITIONER', 'appointment_id', 'APPOINTMENT', 'id'),
('APPOINTMENT_PRACTITIONER', 'practitioner_id', 'PRACTITIONER', 'id'),
('DIAGNOSTIC_ORDER', 'patient_id', 'PATIENT', 'id'),
('DIAGNOSTIC_ORDER', 'person_id', 'PERSON', 'id'),
('DIAGNOSTIC_ORDER', 'encounter_id', 'ENCOUNTER', 'id'),
('DIAGNOSTIC_ORDER', 'practitioner_id', 'PRACTITIONER', 'id'),
('DIAGNOSTIC_ORDER', 'parent_observation_id', 'OBSERVATION', 'id'),
('ENCOUNTER', 'patient_id', 'PATIENT', 'id'),
('ENCOUNTER', 'person_id', 'PERSON', 'id'),
('ENCOUNTER', 'appointment_id', 'APPOINTMENT', 'id'),
('ENCOUNTER', 'episode_of_care_id', 'EPISODE_OF_CARE', 'id'),
('ENCOUNTER', 'practitioner_id', 'PRACTITIONER', 'id'),
('ENCOUNTER', 'service_provider_organisation_id', 'ORGANISATION', 'id'),
('EPISODE_OF_CARE', 'patient_id', 'PATIENT', 'id'),
('EPISODE_OF_CARE', 'person_id', 'PERSON', 'id'),
('EPISODE_OF_CARE', 'organisation_id', 'ORGANISATION', 'id'),
('EPISODE_OF_CARE', 'care_manager_practitioner_id', 'PRACTITIONER', 'id'),
('FLAG', 'patient_id', 'PATIENT', 'id'),
('FLAG', 'person_id', 'PERSON', 'id'),
('LOCATION', 'managing_organisation_id', 'ORGANISATION', 'id'),
('LOCATION_CONTACT', 'location_id', 'LOCATION', 'id'),
('MEDICATION_ORDER', 'patient_id', 'PATIENT', 'id'),
('MEDICATION_ORDER', 'person_id', 'PERSON', 'id'),
('MEDICATION_ORDER', 'encounter_id', 'ENCOUNTER', 'id'),
('MEDICATION_ORDER', 'practitioner_id', 'PRACTITIONER', 'id'),
('MEDICATION_ORDER', 'organisation_id', 'ORGANISATION', 'id'),
('MEDICATION_ORDER', 'allergy_intolerance_id', 'ALLERGY_INTOLERANCE', 'id'),
('MEDICATION_ORDER', 'diagnostic_order_id', 'DIAGNOSTIC_ORDER', 'id'),
('MEDICATION_ORDER', 'medication_statement_id', 'MEDICATION_STATEMENT', 'id'),
('MEDICATION_ORDER', 'observation_id', 'OBSERVATION', 'id'),
('MEDICATION_ORDER', 'referral_request_id', 'REFERRAL_REQUEST', 'id'),
('MEDICATION_STATEMENT', 'patient_id', 'PATIENT', 'id'),
('MEDICATION_STATEMENT', 'person_id', 'PERSON', 'id'),
('MEDICATION_STATEMENT', 'encounter_id', 'ENCOUNTER', 'id'),
('MEDICATION_STATEMENT', 'practitioner_id', 'PRACTITIONER', 'id'),
('MEDICATION_STATEMENT', 'organisation_id', 'ORGANISATION', 'id'),
('MEDICATION_STATEMENT', 'allergy_intolerance_id', 'ALLERGY_INTOLERANCE', 'id'),
('MEDICATION_STATEMENT', 'diagnostic_order_id', 'DIAGNOSTIC_ORDER', 'id'),
('MEDICATION_STATEMENT', 'observation_id', 'OBSERVATION', 'id'),
('MEDICATION_STATEMENT', 'referral_request_id', 'REFERRAL_REQUEST', 'id'),
('OBSERVATION', 'patient_id', 'PATIENT', 'id'),
('OBSERVATION', 'person_id', 'PERSON', 'id'),
('OBSERVATION', 'encounter_id', 'ENCOUNTER', 'id'),
('OBSERVATION', 'parent_obervation_id', 'OBSERVATION', 'id'),
('OBSERVATION', 'practioner_id', 'PRACTITIONER', 'id'),
('ORGANISATION', 'parent_organisation_id', 'ORGANISATION', 'id'),
('PATIENT', 'registered_practice_id', 'ORGANISATION', 'id'),
('PATIENT_ADDRESS', 'patient_id', 'PATIENT', 'id'),
('PATIENT_ADDRESS', 'person_id', 'PERSON', 'id'),
('PATIENT_CONTACT', 'patient_id', 'PATIENT', 'id'),
('PATIENT_CONTACT', 'person_id', 'PERSON', 'id'),
('PATIENT_PERSON', 'patient_id', 'PATIENT', 'id'),
('PATIENT_PERSON', 'person_id', 'PERSON', 'id'),
('PATIENT_REGISTERED_PRACTITIONER_IN_ROLE', 'patient_id', 'PATIENT', 'id'),
('PATIENT_REGISTERED_PRACTITIONER_IN_ROLE', 'person_id', 'PERSON', 'id'),
('PATIENT_REGISTERED_PRACTITIONER_IN_ROLE', 'practitioner_id', 'PRACTITIONER', 'id'),
('PATIENT_REGISTERED_PRACTITIONER_IN_ROLE', 'organisation_id', 'ORGANISATION', 'id'),
('PATIENT_REGISTERED_PRACTITIONER_IN_ROLE', 'episode_of_care_id', 'EPISODE_OF_CARE', 'id'),
('PERSON', 'requesting_patient_id', 'PATIENT', 'id'),
('PRACTITIONER_IN_ROLE', 'practitioner_id', 'PRACTITIONER', 'id'),
('PRACTITIONER_IN_ROLE', 'organisation_id', 'ORGANISATION', 'id'),
('PROCEDURE_REQUEST', 'patient_id', 'PATIENT', 'id'),
('PROCEDURE_REQUEST', 'person_id', 'PERSON', 'id'),
('PROCEDURE_REQUEST', 'encounter_id', 'ENCOUNTER', 'id'),
('PROCEDURE_REQUEST', 'practitioner_id', 'PRACTITIONER', 'id'),
('REFERRAL_REQUEST', 'patient_id', 'PATIENT', 'id'),
('REFERRAL_REQUEST', 'person_id', 'PERSON', 'id'),
('REFERRAL_REQUEST', 'encounter_id', 'ENCOUNTER', 'id'),
('REFERRAL_REQUEST', 'practitioner_id', 'PRACTITIONER', 'id'),
('REFERRAL_REQUEST', 'organisation_id', 'ORGANISATION', 'id'),
('REFERRAL_REQUEST', 'recipient_organisation_id', 'ORGANISATION', 'id'),
('REFERRAL_REQUEST', 'requester_organisation_id', 'ORGANISATION', 'id'),
('SCHEDULE', 'location_id', 'LOCATION', 'id'),
('SCHEDULE', 'practitioner_id', 'PRACTITIONER', 'id'),
('SCHEDULE_PRACTITIONER', 'practitioner_id', 'PRACTITIONER', 'id'),
('SCHEDULE_PRACTITIONER', 'schedule_id', 'SCHEDULE', 'id'),
]
results = []
total_invalid = 0
skipped_count = 0
for source_table, fk_column, ref_table, ref_column in relationships:
# Check if both columns exist before testing the relationship
source_exists = (source_table, fk_column) in column_lookup
ref_exists = (ref_table, ref_column) in column_lookup
if not source_exists or not ref_exists:
skipped_count += 1
missing = []
if not source_exists:
missing.append(f"{source_table}.{fk_column}")
if not ref_exists:
missing.append(f"{ref_table}.{ref_column}")
results.append({
'SOURCE_TABLE': source_table,
'FOREIGN_KEY': fk_column,
'REFERENCED_TABLE': f"{ref_table}.{ref_column}",
'INVALID_REFERENCES': f"SKIPPED: Missing {', '.join(missing)}"
})
continue
try:
# Find records with foreign keys that don't exist in the referenced table
query = f"""
SELECT COUNT(*) as invalid_count
FROM "{db}"."{schema}"."{source_table}" src
LEFT JOIN "{db}"."{schema}"."{ref_table}" ref
ON src."{fk_column}" = ref."{ref_column}"
WHERE src."{fk_column}" IS NOT NULL
AND ref."{ref_column}" IS NULL
"""
result = session.sql(query).collect()[0]
invalid_count = result['INVALID_COUNT']
if invalid_count > 0:
total_invalid += invalid_count
results.append({
'SOURCE_TABLE': source_table,
'FOREIGN_KEY': fk_column,
'REFERENCED_TABLE': f"{ref_table}.{ref_column}",
'INVALID_REFERENCES': invalid_count
})
except Exception as e:
results.append({
'SOURCE_TABLE': source_table,
'FOREIGN_KEY': fk_column,
'REFERENCED_TABLE': f"{ref_table}.{ref_column}",
'INVALID_REFERENCES': f"ERROR: {str(e)[:50]}"
})
# Add summary
tested_count = len(relationships) - skipped_count
if not any(isinstance(r.get("INVALID_REFERENCES"), int) and r["INVALID_REFERENCES"] > 0 for r in results):
results.insert(0, {
'SOURCE_TABLE': 'SUMMARY',
'FOREIGN_KEY': f'Tested {tested_count} relationships, skipped {skipped_count}',
'REFERENCED_TABLE': 'All referential integrity checks passed',
'INVALID_REFERENCES': 0
})
else:
tables_with_issues = len([r for r in results if isinstance(r.get("INVALID_REFERENCES"), int) and r["INVALID_REFERENCES"] > 0])
results.insert(0, {
'SOURCE_TABLE': 'SUMMARY',
'FOREIGN_KEY': f'Tested {tested_count} relationships, skipped {skipped_count}',
'REFERENCED_TABLE': f'Found {tables_with_issues} tables with invalid references',
'INVALID_REFERENCES': total_invalid
})
return session.create_dataframe(results)Metadata
Metadata
Assignees
Labels
Type
Projects
Status
To fix ⛏️
