Skip to content

OLIDS_MASKED: Referential Integrity Test Failures #22

@EddieDavison92

Description

@EddieDavison92

Problem Summary

Referential integrity testing identified 10 tables with invalid foreign key references, totaling 273,091 orphaned records. This excludes concept mappings and external system table references.

Findings

  • APPOINTMENT.patient_id → PATIENT.id: 1 invalid reference
  • APPOINTMENT_PRACTITIONER.appointment_id → APPOINTMENT.id: 46,730 invalid references
  • ENCOUNTER.patient_id → PATIENT.id: 24 invalid references
  • ENCOUNTER.appointment_id → APPOINTMENT.id: 24 invalid references
  • ENCOUNTER.episode_of_care_id → EPISODE_OF_CARE.id: 12 invalid references
  • LOCATION.managing_organisation_id → ORGANISATION.id: 1,170 invalid references
  • MEDICATION_ORDER.referral_request_id → REFERRAL_REQUEST.id: 4 invalid references
  • MEDICATION_STATEMENT.referral_request_id → REFERRAL_REQUEST.id: 227 invalid references
  • PATIENT_REGISTERED_PRACTITIONER_IN_ROLE.person_id → PERSON.id: 34,610 invalid references
  • SCHEDULE.location_id → LOCATION.id: 190,289 invalid references

Impact

These referential integrity violations would have a low-medium impact on analyses that rely on Appointment and Location data. Counts for other failures are low and will have a limited impact.

The person_id failure relates to known issue #17

Test Evidence

Image

Snowpark Python Script:

import snowflake.snowpark as snowpark

def main(session: snowpark.Session):
    db = "Data_Store_OLIDS_UAT"
    schema = "OLIDS_MASKED"
    
    session.sql(f'USE DATABASE "{db}"').collect()
    session.sql(f'USE SCHEMA "{schema}"').collect()
    
    # Get all available columns to validate relationships
    columns_query = f"""
    SELECT table_name, column_name
    FROM "{db}".INFORMATION_SCHEMA.COLUMNS 
    WHERE table_schema = '{schema}'
    """
    available_columns = session.sql(columns_query).collect()
    column_lookup = {(row['TABLE_NAME'], row['COLUMN_NAME']) for row in available_columns}
    
    # All table-to-table relationships (excluding concept mappings and external system references)
    relationships = [
        ('ALLERGY_INTOLERANCE', 'patient_id', 'PATIENT', 'id'),
        ('ALLERGY_INTOLERANCE', 'person_id', 'PERSON', 'id'),
        ('ALLERGY_INTOLERANCE', 'encounter_id', 'ENCOUNTER', 'id'),
        ('ALLERGY_INTOLERANCE', 'practitioner_id', 'PRACTITIONER', 'id'),
        
        ('APPOINTMENT', 'patient_id', 'PATIENT', 'id'),
        ('APPOINTMENT', 'person_id', 'PERSON', 'id'),
        ('APPOINTMENT', 'organisation_id', 'ORGANISATION', 'id'),
        ('APPOINTMENT', 'practitioner_in_role_id', 'PRACTITIONER_IN_ROLE', 'id'),
        ('APPOINTMENT', 'schedule_id', 'SCHEDULE', 'id'),
        
        ('APPOINTMENT_PRACTITIONER', 'appointment_id', 'APPOINTMENT', 'id'),
        ('APPOINTMENT_PRACTITIONER', 'practitioner_id', 'PRACTITIONER', 'id'),
        
        ('DIAGNOSTIC_ORDER', 'patient_id', 'PATIENT', 'id'),
        ('DIAGNOSTIC_ORDER', 'person_id', 'PERSON', 'id'),
        ('DIAGNOSTIC_ORDER', 'encounter_id', 'ENCOUNTER', 'id'),
        ('DIAGNOSTIC_ORDER', 'practitioner_id', 'PRACTITIONER', 'id'),
        ('DIAGNOSTIC_ORDER', 'parent_observation_id', 'OBSERVATION', 'id'),
        
        ('ENCOUNTER', 'patient_id', 'PATIENT', 'id'),
        ('ENCOUNTER', 'person_id', 'PERSON', 'id'),
        ('ENCOUNTER', 'appointment_id', 'APPOINTMENT', 'id'),
        ('ENCOUNTER', 'episode_of_care_id', 'EPISODE_OF_CARE', 'id'),
        ('ENCOUNTER', 'practitioner_id', 'PRACTITIONER', 'id'),
        ('ENCOUNTER', 'service_provider_organisation_id', 'ORGANISATION', 'id'),
        
        ('EPISODE_OF_CARE', 'patient_id', 'PATIENT', 'id'),
        ('EPISODE_OF_CARE', 'person_id', 'PERSON', 'id'),
        ('EPISODE_OF_CARE', 'organisation_id', 'ORGANISATION', 'id'),
        ('EPISODE_OF_CARE', 'care_manager_practitioner_id', 'PRACTITIONER', 'id'),
        
        ('FLAG', 'patient_id', 'PATIENT', 'id'),
        ('FLAG', 'person_id', 'PERSON', 'id'),
        
        ('LOCATION', 'managing_organisation_id', 'ORGANISATION', 'id'),
        ('LOCATION_CONTACT', 'location_id', 'LOCATION', 'id'),
        
        ('MEDICATION_ORDER', 'patient_id', 'PATIENT', 'id'),
        ('MEDICATION_ORDER', 'person_id', 'PERSON', 'id'),
        ('MEDICATION_ORDER', 'encounter_id', 'ENCOUNTER', 'id'),
        ('MEDICATION_ORDER', 'practitioner_id', 'PRACTITIONER', 'id'),
        ('MEDICATION_ORDER', 'organisation_id', 'ORGANISATION', 'id'),
        ('MEDICATION_ORDER', 'allergy_intolerance_id', 'ALLERGY_INTOLERANCE', 'id'),
        ('MEDICATION_ORDER', 'diagnostic_order_id', 'DIAGNOSTIC_ORDER', 'id'),
        ('MEDICATION_ORDER', 'medication_statement_id', 'MEDICATION_STATEMENT', 'id'),
        ('MEDICATION_ORDER', 'observation_id', 'OBSERVATION', 'id'),
        ('MEDICATION_ORDER', 'referral_request_id', 'REFERRAL_REQUEST', 'id'),
        
        ('MEDICATION_STATEMENT', 'patient_id', 'PATIENT', 'id'),
        ('MEDICATION_STATEMENT', 'person_id', 'PERSON', 'id'),
        ('MEDICATION_STATEMENT', 'encounter_id', 'ENCOUNTER', 'id'),
        ('MEDICATION_STATEMENT', 'practitioner_id', 'PRACTITIONER', 'id'),
        ('MEDICATION_STATEMENT', 'organisation_id', 'ORGANISATION', 'id'),
        ('MEDICATION_STATEMENT', 'allergy_intolerance_id', 'ALLERGY_INTOLERANCE', 'id'),
        ('MEDICATION_STATEMENT', 'diagnostic_order_id', 'DIAGNOSTIC_ORDER', 'id'),
        ('MEDICATION_STATEMENT', 'observation_id', 'OBSERVATION', 'id'),
        ('MEDICATION_STATEMENT', 'referral_request_id', 'REFERRAL_REQUEST', 'id'),
        
        ('OBSERVATION', 'patient_id', 'PATIENT', 'id'),
        ('OBSERVATION', 'person_id', 'PERSON', 'id'),
        ('OBSERVATION', 'encounter_id', 'ENCOUNTER', 'id'),
        ('OBSERVATION', 'parent_obervation_id', 'OBSERVATION', 'id'),
        ('OBSERVATION', 'practioner_id', 'PRACTITIONER', 'id'),
        
        ('ORGANISATION', 'parent_organisation_id', 'ORGANISATION', 'id'),
        ('PATIENT', 'registered_practice_id', 'ORGANISATION', 'id'),
        
        ('PATIENT_ADDRESS', 'patient_id', 'PATIENT', 'id'),
        ('PATIENT_ADDRESS', 'person_id', 'PERSON', 'id'),
        
        ('PATIENT_CONTACT', 'patient_id', 'PATIENT', 'id'),
        ('PATIENT_CONTACT', 'person_id', 'PERSON', 'id'),
        
        ('PATIENT_PERSON', 'patient_id', 'PATIENT', 'id'),
        ('PATIENT_PERSON', 'person_id', 'PERSON', 'id'),
        
        ('PATIENT_REGISTERED_PRACTITIONER_IN_ROLE', 'patient_id', 'PATIENT', 'id'),
        ('PATIENT_REGISTERED_PRACTITIONER_IN_ROLE', 'person_id', 'PERSON', 'id'),
        ('PATIENT_REGISTERED_PRACTITIONER_IN_ROLE', 'practitioner_id', 'PRACTITIONER', 'id'),
        ('PATIENT_REGISTERED_PRACTITIONER_IN_ROLE', 'organisation_id', 'ORGANISATION', 'id'),
        ('PATIENT_REGISTERED_PRACTITIONER_IN_ROLE', 'episode_of_care_id', 'EPISODE_OF_CARE', 'id'),
        
        ('PERSON', 'requesting_patient_id', 'PATIENT', 'id'),
        
        ('PRACTITIONER_IN_ROLE', 'practitioner_id', 'PRACTITIONER', 'id'),
        ('PRACTITIONER_IN_ROLE', 'organisation_id', 'ORGANISATION', 'id'),
        
        ('PROCEDURE_REQUEST', 'patient_id', 'PATIENT', 'id'),
        ('PROCEDURE_REQUEST', 'person_id', 'PERSON', 'id'),
        ('PROCEDURE_REQUEST', 'encounter_id', 'ENCOUNTER', 'id'),
        ('PROCEDURE_REQUEST', 'practitioner_id', 'PRACTITIONER', 'id'),
        
        ('REFERRAL_REQUEST', 'patient_id', 'PATIENT', 'id'),
        ('REFERRAL_REQUEST', 'person_id', 'PERSON', 'id'),
        ('REFERRAL_REQUEST', 'encounter_id', 'ENCOUNTER', 'id'),
        ('REFERRAL_REQUEST', 'practitioner_id', 'PRACTITIONER', 'id'),
        ('REFERRAL_REQUEST', 'organisation_id', 'ORGANISATION', 'id'),
        ('REFERRAL_REQUEST', 'recipient_organisation_id', 'ORGANISATION', 'id'),
        ('REFERRAL_REQUEST', 'requester_organisation_id', 'ORGANISATION', 'id'),
        
        ('SCHEDULE', 'location_id', 'LOCATION', 'id'),
        ('SCHEDULE', 'practitioner_id', 'PRACTITIONER', 'id'),
        
        ('SCHEDULE_PRACTITIONER', 'practitioner_id', 'PRACTITIONER', 'id'),
        ('SCHEDULE_PRACTITIONER', 'schedule_id', 'SCHEDULE', 'id'),
    ]
    
    results = []
    total_invalid = 0
    skipped_count = 0
    
    for source_table, fk_column, ref_table, ref_column in relationships:
        # Check if both columns exist before testing the relationship
        source_exists = (source_table, fk_column) in column_lookup
        ref_exists = (ref_table, ref_column) in column_lookup
        
        if not source_exists or not ref_exists:
            skipped_count += 1
            missing = []
            if not source_exists:
                missing.append(f"{source_table}.{fk_column}")
            if not ref_exists:
                missing.append(f"{ref_table}.{ref_column}")
            
            results.append({
                'SOURCE_TABLE': source_table,
                'FOREIGN_KEY': fk_column,
                'REFERENCED_TABLE': f"{ref_table}.{ref_column}",
                'INVALID_REFERENCES': f"SKIPPED: Missing {', '.join(missing)}"
            })
            continue
        
        try:
            # Find records with foreign keys that don't exist in the referenced table
            query = f"""
            SELECT COUNT(*) as invalid_count
            FROM "{db}"."{schema}"."{source_table}" src
            LEFT JOIN "{db}"."{schema}"."{ref_table}" ref 
                ON src."{fk_column}" = ref."{ref_column}"
            WHERE src."{fk_column}" IS NOT NULL 
                AND ref."{ref_column}" IS NULL
            """
            
            result = session.sql(query).collect()[0]
            invalid_count = result['INVALID_COUNT']
            
            if invalid_count > 0:
                total_invalid += invalid_count
                results.append({
                    'SOURCE_TABLE': source_table,
                    'FOREIGN_KEY': fk_column,
                    'REFERENCED_TABLE': f"{ref_table}.{ref_column}",
                    'INVALID_REFERENCES': invalid_count
                })
                
        except Exception as e:
            results.append({
                'SOURCE_TABLE': source_table,
                'FOREIGN_KEY': fk_column,
                'REFERENCED_TABLE': f"{ref_table}.{ref_column}",
                'INVALID_REFERENCES': f"ERROR: {str(e)[:50]}"
            })
    
    # Add summary
    tested_count = len(relationships) - skipped_count
    if not any(isinstance(r.get("INVALID_REFERENCES"), int) and r["INVALID_REFERENCES"] > 0 for r in results):
        results.insert(0, {
            'SOURCE_TABLE': 'SUMMARY',
            'FOREIGN_KEY': f'Tested {tested_count} relationships, skipped {skipped_count}',
            'REFERENCED_TABLE': 'All referential integrity checks passed',
            'INVALID_REFERENCES': 0
        })
    else:
        tables_with_issues = len([r for r in results if isinstance(r.get("INVALID_REFERENCES"), int) and r["INVALID_REFERENCES"] > 0])
        results.insert(0, {
            'SOURCE_TABLE': 'SUMMARY',
            'FOREIGN_KEY': f'Tested {tested_count} relationships, skipped {skipped_count}',
            'REFERENCED_TABLE': f'Found {tables_with_issues} tables with invalid references',
            'INVALID_REFERENCES': total_invalid
        })
    
    return session.create_dataframe(results)

Metadata

Metadata

Assignees

Labels

Projects

Status

To fix ⛏️

Milestone

No milestone

Relationships

None yet

Development

No branches or pull requests

Issue actions