sunlabuiuc · cpaa4 · May 7, 2025
diff --git a/pyhealth/conflict_patients.csv b/pyhealth/conflict_patients.csv
@@ -0,0 +1,9 @@
+hadm_id,medications,medication_cuis,allergies,allergy_cuis
+22841357,"['Acetaminophen', 'Albuterol Inhaler Q6Hprn Wheezin', 'Calcium Carbonate', 'Furosemide', 'Lactulose', 'Ralte', 'Rifaximin', 'Sulfameth/Trimethoprim Ds', 'Tiotropium Bromide']","['161', '435', '1897', '4603', '6218', '719872', '35619', '10180', '69120']",['Percocet'],['161']
+22927623,['Omeprazole'],['283742'],['Omeprazole'],['283742']
+22634923,"['Acetaminophen', 'Docusate Sodium', 'Hydromorphone M', 'Albuterol Inhaler Q6Hprn Wheezin', 'Levothyroxine Sodium', 'Prenatal Vitamins']","['161', '82003', '3423', '435', '10582', '237076']","['Percocet', 'Cucumber']","['161', '1305719']"
+20798638,['Acetaminophen'],['161'],"['Percocet', 'Cucumber', 'Tegaderm']","['161', '1305719', '10450']"
+27683372,"['Acetaminophen', 'Azathioprine', 'Fluoxetine', 'Fluticasone Propionate', 'Hydromorphone', 'Metoprolol Succinate Xl', 'Omeprazole', 'Prednisone', 'Simvastatin', 'Tiotropium Bromide']","['161', '1256', '4493', '41126', '3423', '6918', '283742', '8640', '36567', '69120']","['Ragweed', 'Morphine', 'Percocet']","['124363', '7052', '161']"
+29460260,"['Docusate Sodium', 'Senna', 'Pantoprazole', 'Simvastatin', 'Multivitamin', 'Lisinopril', 'Fluticasone', 'Amitriptyline', 'Aspirin', 'Compression Stockin', 'Acetaminophen']","['82003', '36387', '40790', '36567', '604365', '29046', '41126', '704', '1191', '408374', '161']","['Nifedipine Er', 'Amitriptyline', 'Prilosec Otc', 'Terazosin']","['284802', '704', '405277', '37798']"
+23352834,"['Amitriptyline', 'Fluticasone', 'Lasix', 'Lisinopril', 'Pantoprazole', 'Potassium Chloride', 'Simvastatin', 'Acetaminophen', 'Aspirin', 'Docusate Sodium', 'Multivitamintxminerals', 'Zofran']","['704', '41126', '4603', '29046', '40790', '8591', '36567', '161', '1191', '82003', None, '26225']","['Nifedipine', 'Amitriptyline', 'Prilosec Otc', 'Terazosin']","['7417', '704', '405277', '37798']"
+29391916,"['Amitriptyline', 'Aspirin', 'Dabi', 'Fluticasone Propionate Nasal Spry Nu Qpm', 'Lidocaine Patch Ptch Td Rle Pain', 'Lisinopril', 'Multivitamins W/Minerals', 'Pantoprazole', 'Pravastatin', 'Docusate Sodium', 'Amlodipine', 'Outpatient Lab Work Please Draw Chem Ca M']","['704', '1191', '1546356', '2661274', '2668917', '29046', '235368', '40790', '42463', '82003', '17767', '215977']","['Nifedipine', 'Amitriptyline', 'Prilosec Otc', 'Terazosin']","['7417', '704', '405277', '37798']"
diff --git a/pyhealth/datasets/__init__.py b/pyhealth/datasets/__init__.py
@@ -43,3 +43,4 @@ def __init__(self, *args, **kwargs):
 from .tuab import TUABDataset
 from .tuev import TUEVDataset
 from .utils import collate_fn_dict, collate_fn_dict_with_padding, get_dataloader
+from .med_allergy_conflict_dataset import MedAllergyConflictDataset
diff --git a/pyhealth/datasets/med_allergy_conflict_dataset.py b/pyhealth/datasets/med_allergy_conflict_dataset.py
@@ -0,0 +1,254 @@
+import polars as pl
+from pyhealth.datasets import BaseDataset
+import os
+import gzip
+import shutil
+import json
+import pandas as pd
+import re
+import requests
+import time
+from typing import Dict, Optional
+from tqdm import tqdm
+
+
+class MedAllergyConflictDataset(BaseDataset):
+    """Medications and allergies extracted from discharge notes.
+
+    On construction the dataset decompresses ``<root>/note/discharge.csv.gz``
+    (once), walks the notes in file order, pulls the ``Allergies:`` line and
+    the ``Discharge Medications:`` section out of each note with regular
+    expressions, and maps every extracted term to an RxNorm ingredient CUI
+    through the RxNav REST API.  Results live in ``self.patients`` keyed by
+    ``hadm_id`` (as a string); API answers are cached as JSON files under
+    ``<root>/file`` so later runs do not repeat the lookups.
+
+    Args:
+        root: Directory that contains ``note/discharge.csv.gz``.
+        **kwargs: ``dev`` is forwarded to ``BaseDataset``.  ``max_patients``
+            (default 200, ``None`` for no limit) caps how many admissions
+            having both allergies and medications are collected.
+    """
+
+    # Dosage-form / frequency tokens dropped from medication names.
+    STOPWORDS = {
+        'tab', 'tablet', 'po', 'daily', 'chewable', 'cap', 'capsule', 'unit',
+        'sc', 'si', 'bid', 'prn', 'ml', 'mg', 'drop', 'puff', 'tid', 'qid',
+        'qh', 'qhs', 'ih', 'inh', 'soln', 'suspension', 'ointment', 'ophth', 'sol'
+    }
+
+    # Lower-cased allergy entries that mean "nothing recorded"; such entries
+    # are discarded instead of being sent to RxNav.
+    PLACEHOLDERS = {
+        '___', 'nka', 'none', 'n/a', 'nkda',
+        'no known allergies', 'no known drug allergies', 'adverse drug reactions',
+    }
+
+    RXNAV_BASE_URL = "https://rxnav.nlm.nih.gov/REST"
+    REQUEST_TIMEOUT = 5
+    DEFAULT_MAX_PATIENTS = 200
+
+    # Splits a numbered list ("1. foo\n2. bar") into its entries.
+    _ENTRY_SPLIT_RE = re.compile(r'\n?\d+\.\s+')
+    # First dose ("325 mg", "___ mg", "10 units") or "Sig:" marker; everything
+    # before it is the drug name.  The unit must follow a number (or a
+    # de-identified "___") and end on a word boundary, so a plain letter "g"
+    # inside a drug name no longer truncates the name.
+    _DOSE_OR_SIG_RE = re.compile(
+        r'(?:\d[\d.,]*|_+)\s*(?:mg|meq|mcg|g|units?)\b|\bsig\s*:',
+        re.IGNORECASE,
+    )
+
+    def __init__(self, root: str, **kwargs):
+        config_path = os.path.join(root, "pyhealth.yaml")
+
+        # BaseDataset is given a config path; write a dummy one when the data
+        # directory does not ship with it.
+        if not os.path.exists(config_path):
+            with open(config_path, "w") as f:
+                f.write("""
+dataset_name: MedAllergyConflictDataset
+version: 1.0.0
+description: Dummy config for medication-allergy conflict detection.
+tables:
+  dummy_table:
+    path: dummy.csv
+    file_path: dummy.csv
+    type: custom
+    attributes: ["dummy_attr"]
+""")
+
+        super().__init__(
+            dataset_name="MedAllergyConflictDataset",
+            root=root,
+            tables=[],
+            dev=kwargs.get("dev", False),
+            config_path=config_path
+        )
+
+        self.patients = {}
+        self.max_patients = kwargs.get("max_patients", self.DEFAULT_MAX_PATIENTS)
+        self.extract_dir = os.path.join(root, "file")
+        os.makedirs(self.extract_dir, exist_ok=True)
+
+        self.gz_path = os.path.join(root, "note", "discharge.csv.gz")
+        self.csv_path = os.path.join(self.extract_dir, "discharge.csv")
+        self.med_cache_path = os.path.join(self.extract_dir, "med_cache.json")
+        self.allergy_cache_path = os.path.join(self.extract_dir, "allergy_cache.json")
+
+        self._load_data()
+
+    def load_data(self):
+        """Return an empty polars frame.
+
+        NOTE(review): presumably called by ``BaseDataset`` in place of its
+        table loading; the real work happens in ``_load_data`` - confirm
+        against the base class.
+        """
+        return pl.DataFrame([])
+
+    def _load_data(self):
+        """Decompress the notes if needed, load them and the caches, then preprocess.
+
+        Raises:
+            FileNotFoundError: if neither the extracted CSV nor the ``.gz``
+                archive exists.
+        """
+        if not os.path.exists(self.csv_path):
+            if not os.path.exists(self.gz_path):
+                raise FileNotFoundError(
+                    f"Compressed discharge notes not found: {self.gz_path}"
+                )
+            # Decompress into a side file and rename at the end so that an
+            # interrupted run never leaves a truncated CSV that would be
+            # mistaken for a complete one next time.
+            partial_path = self.csv_path + ".part"
+            with gzip.open(self.gz_path, 'rb') as f_in, open(partial_path, 'wb') as f_out:
+                shutil.copyfileobj(f_in, f_out)
+            os.replace(partial_path, self.csv_path)
+
+        self.df = pd.read_csv(self.csv_path)
+        self.rxcui_atc_cache = self._load_json(self.med_cache_path)
+        self.allergy_cui_cache = self._load_json(self.allergy_cache_path)
+        self._preprocess()
+
+    def _load_json(self, path: str) -> Dict:
+        """Read a JSON object from ``path``; return ``{}`` if missing or unreadable."""
+        if not os.path.exists(path):
+            return {}
+        try:
+            with open(path, 'r') as f:
+                loaded = json.load(f)
+        except ValueError:
+            # A corrupt cache only costs repeated lookups, so start fresh.
+            return {}
+        return loaded if isinstance(loaded, dict) else {}
+
+    def _save_json(self, obj: Dict, path: str):
+        """Write ``obj`` to ``path`` as JSON, replacing the file atomically."""
+        partial_path = path + ".part"
+        with open(partial_path, 'w') as f:
+            json.dump(obj, f)
+        os.replace(partial_path, path)
+
+    def _normalize_med_name(self, name: str) -> str:
+        """Lower-case, strip punctuation, drop stopwords/pure numbers, title-case."""
+        name = name.lower()
+        name = re.sub(r'[^a-z0-9\s/-]', '', name)
+        tokens = name.split()
+        filtered = [t for t in tokens if t not in self.STOPWORDS and not t.isdigit()]
+        return ' '.join(filtered).strip().title()
+
+    def _extract_allergies(self, text):
+        """Return the text following the first ``Allergies:`` label, or ``None``.
+
+        Non-string input (e.g. a NaN cell from pandas) yields ``None``.
+        """
+        if not isinstance(text, str):
+            return None
+        match = re.search(r"Allergies:\s*(.*?)(?:\n|$)", text)
+        return match.group(1).strip() if match else None
+
+    def _extract_medications(self, text):
+        """Return normalized drug names from the ``Discharge Medications:`` section.
+
+        The section runs from the header to the next blank line (or the end
+        of the note).  Each numbered entry is cut at its first dose or
+        ``Sig:`` marker, parenthesised text is removed, and the remainder is
+        passed through ``_normalize_med_name``.  Returns ``[]`` when the
+        section is absent or ``text`` is not a string.
+        """
+        if not isinstance(text, str):
+            return []
+        header = "Discharge Medications:"
+        start = text.find(header)
+        if start == -1:
+            return []
+        body_start = start + len(header)
+        end = text.find("\n\n", body_start)
+        # The header is excluded whether or not a blank line terminates the
+        # section.
+        section = (text[body_start:] if end == -1 else text[body_start:end]).strip()
+
+        meds = []
+        for entry in self._ENTRY_SPLIT_RE.split(section):
+            name = self._DOSE_OR_SIG_RE.split(entry, maxsplit=1)[0]
+            name = re.sub(r'\(.*?\)', '', name)
+            name = re.sub(r'[^a-zA-Z0-9\s/]+', '', name)
+            cleaned = self._normalize_med_name(name)
+            if cleaned:
+                meds.append(cleaned)
+        return meds
+
+    def _preprocess(self):
+        """Fill ``self.patients`` from ``self.df`` and persist the lookup caches.
+
+        Admissions lacking either an allergy line or a medication section,
+        or whose allergies are all placeholders, are skipped and do not count
+        toward ``max_patients``.  The caches are written even if the loop is
+        interrupted, so completed lookups are not lost.
+        """
+        print("Starting preprocessing loop...")
+        limit = getattr(self, "max_patients", self.DEFAULT_MAX_PATIENTS)
+        count = 0
+        try:
+            for _, row in self.df.iterrows():
+                if limit is not None and count >= limit:
+                    break
+                hadm_id = str(row['hadm_id'])
+                text = row['text']
+                allergies = self._extract_allergies(text)
+                meds = self._extract_medications(text)
+                if not allergies or not meds:
+                    continue
+                # Check the whole line first: "n/a" would otherwise be split
+                # on "/" and never recognised.
+                if allergies.strip().lower() in self.PLACEHOLDERS:
+                    continue
+
+                parsed_allergies = [
+                    a.strip().title()
+                    for a in re.split(r'[,/]', allergies)
+                    if a.strip() and a.strip().lower() not in self.PLACEHOLDERS
+                ]
+                if not parsed_allergies:
+                    continue
+
+                allergy_cuis = [self._resolve_allergy_cui(a) for a in parsed_allergies]
+                med_cuis = [self._resolve_med_ingredient_cui(m) for m in meds]
+
+                self.patients[hadm_id] = {
+                    "hadm_id": hadm_id,
+                    "allergies": parsed_allergies,
+                    "medications": meds,
+                    "allergy_cuis": allergy_cuis,
+                    "medication_cuis": med_cuis,
+                }
+                count += 1
+                if count % 10 == 0:
+                    print(f"Processed {count} patients...")
+
+            print(f"Finished preprocessing. Total patients processed: {count}")
+        finally:
+            self._save_json(self.rxcui_atc_cache, self.med_cache_path)
+            self._save_json(self.allergy_cui_cache, self.allergy_cache_path)
+
+    def _get_json(self, endpoint: str, params: Optional[Dict] = None) -> Dict:
+        """GET ``RXNAV_BASE_URL/endpoint`` and return the decoded JSON object.
+
+        Raises:
+            requests.exceptions.RequestException: on transport or HTTP errors.
+            ValueError: if the body is not valid JSON.
+        """
+        response = requests.get(
+            f"{self.RXNAV_BASE_URL}/{endpoint}",
+            params=params,
+            timeout=self.REQUEST_TIMEOUT,
+        )
+        response.raise_for_status()
+        payload = response.json()
+        return payload if isinstance(payload, dict) else {}
+
+    def _lookup_rxcui(self, term: str) -> Optional[str]:
+        """Return an RxCUI for ``term``: name search first, approximate match second."""
+        exact = self._get_json("rxcui.json", {"name": term, "search": 1})
+        ids = (exact.get("idGroup") or {}).get("rxnormId") or []
+        if ids and ids[0]:
+            return ids[0]
+        approx = self._get_json("approximateTerm.json", {"term": term})
+        candidates = (approx.get("approximateGroup") or {}).get("candidate") or []
+        return candidates[0].get("rxcui") if candidates else None
+
+    def _lookup_ingredient_cui(self, rxcui: str) -> str:
+        """Return the first related ``IN`` concept's RxCUI, or ``rxcui`` if none."""
+        payload = self._get_json(f"rxcui/{rxcui}/related.json", {"tty": "IN"})
+        groups = (payload.get("relatedGroup") or {}).get("conceptGroup") or []
+        if groups:
+            props = groups[0].get("conceptProperties") or []
+            if props:
+                return props[0].get("rxcui") or rxcui
+        return rxcui
+
+    def _lookup_atc_code(self, rxcui: str) -> Optional[str]:
+        """Return the first class id whose class name contains ``ATC``, if any.
+
+        A non-200 response is treated as "no class information".
+        NOTE(review): the filter tests ``className`` as the original code
+        did; confirm against the RxNav response schema that this is the
+        intended field.
+        """
+        response = requests.get(
+            f"{self.RXNAV_BASE_URL}/rxcui/{rxcui}/class.json",
+            timeout=self.REQUEST_TIMEOUT,
+        )
+        if response.status_code != 200:
+            return None
+        payload = response.json()
+        if not isinstance(payload, dict):
+            return None
+        infos = (payload.get("rxclassDrugInfoList") or {}).get("rxclassDrugInfo") or []
+        for info in infos:
+            item = info.get("rxclassMinConceptItem") or {}
+            if "ATC" in (item.get("className") or ""):
+                return item.get("classId")
+        return None
+
+    def _resolve_allergy_cui(self, term: str, max_retries=3) -> Optional[str]:
+        """Map an allergy term to an ingredient CUI, using and updating the cache.
+
+        Returns ``None`` when RxNav knows no match (this answer is cached) or
+        when every attempt failed (not cached, so a later run can retry).
+        """
+        term = term.strip().title()
+        if term in self.allergy_cui_cache:
+            return self.allergy_cui_cache[term]
+
+        for attempt in range(max_retries):
+            try:
+                rxcui = self._lookup_rxcui(term)
+                ingredient_cui = self._lookup_ingredient_cui(rxcui) if rxcui else None
+            except (requests.exceptions.RequestException, ValueError):
+                # Back off before the next attempt, but not after the last one.
+                if attempt + 1 < max_retries:
+                    time.sleep(1)
+                continue
+            self.allergy_cui_cache[term] = ingredient_cui
+            return ingredient_cui
+        return None
+
+    def _resolve_med_ingredient_cui(self, drug_name: str, max_retries=3) -> Optional[str]:
+        """Map a drug name to an ingredient CUI, using and updating the cache.
+
+        The cache stores ``[rxcui, atc_code, ingredient_cui]`` per name; only
+        the ingredient CUI is returned.  Failed lookups (all attempts raised)
+        return ``None`` without being cached.
+        """
+        drug_name = drug_name.strip().title()
+        if drug_name in self.rxcui_atc_cache:
+            return self.rxcui_atc_cache[drug_name][2]
+
+        for attempt in range(max_retries):
+            try:
+                rxcui = self._lookup_rxcui(drug_name)
+                atc_code = ingredient_cui = None
+                if rxcui:
+                    atc_code = self._lookup_atc_code(rxcui)
+                    ingredient_cui = self._lookup_ingredient_cui(rxcui)
+            except (requests.exceptions.RequestException, ValueError):
+                if attempt + 1 < max_retries:
+                    time.sleep(1)
+                continue
+            self.rxcui_atc_cache[drug_name] = [rxcui, atc_code, ingredient_cui]
+            return ingredient_cui
+        return None
+
+    def get_all_patient_ids(self):
+        """Return the ``hadm_id`` keys of all collected admissions."""
+        return list(self.patients.keys())
+
+    def get_patient_by_id(self, patient_id: str) -> Optional[Dict]:
+        """Return the record stored for ``patient_id``, or ``None`` if unknown."""
+        return self.patients.get(patient_id)
+
+    def export_medications(self, output_path: str):
+        """Write one CSV row per (admission, medication) pair to ``output_path``."""
+        rows = []
+        for p in self.patients.values():
+            for med, cui in zip(p["medications"], p["medication_cuis"]):
+                rows.append({"hadm_id": p["hadm_id"], "medication": med, "ingredient_cui": cui})
+        pd.DataFrame(rows).to_csv(output_path, index=False)
+
+    def export_allergies(self, output_path: str):
+        """Write one CSV row per (admission, allergy) pair to ``output_path``."""
+        rows = []
+        for p in self.patients.values():
+            for allergen, cui in zip(p["allergies"], p["allergy_cuis"]):
+                rows.append({"hadm_id": p["hadm_id"], "allergy": allergen, "allergy_cui": cui})
+        pd.DataFrame(rows).to_csv(output_path, index=False)
+
+if __name__ == "__main__":
+    import sys
+
+    # The data root may be passed as the first command-line argument; without
+    # one, the original placeholder path is used.
+    data_root = sys.argv[1] if len(sys.argv) > 1 else "/your/data/path"
+    dataset = MedAllergyConflictDataset(root=data_root)
+    print(f"Total patients: {len(dataset.get_all_patient_ids())}")
diff --git a/pyhealth/examples/README.md b/pyhealth/examples/README.md
@@ -0,0 +1,21 @@
+# Medication-Allergy Conflict Detection Task
+
+This module adds a custom PyHealth dataset and task for detecting conflicts between prescribed medications and known patient allergies using MIMIC-IV discharge summaries.
+
+## Features
+
+- Parses free-text discharge notes to extract medications and allergies
+- Normalizes terms using RxNorm APIs to get ingredient CUIs
+- Detects conflicts based on matching CUIs between meds and allergies
+- Provides a PyHealth task interface (`get_label()`, `__call__()`, `export_conflicts()`)
+
+## Files
+
+- `med_allergy_conflict_dataset.py` – loads and processes MIMIC-IV discharge summaries
+- `allergy_conflict_task.py` – PyHealth task that detects CUI-level conflicts
+- `test_allergy_conflict_task.py` – unit tests for the task and dataset
+
+## How to Run
+
+```bash
+python pyhealth/tasks/allergy_conflict_task.py
+```
diff --git a/pyhealth/examples/image.png b/pyhealth/examples/image.png
diff --git a/pyhealth/examples/run_med_allergy_example.py b/pyhealth/examples/run_med_allergy_example.py
@@ -0,0 +1,26 @@
+print("🚀 STARTING run_med_allergy_example.py")
+
+import sys
+
+import pandas as pd
+from pyhealth.tasks.allergy_conflict_task import AllergyConflictDetectionTask
+from pyhealth.datasets.med_allergy_conflict_dataset import MedAllergyConflictDataset  # Adjust if in a different file
+
+# Fallback data folder (the value this script used to hard-code).
+DEFAULT_ROOT_PATH = "/Users/royal/Documents/Pyhealth_testing/mimic-iv-note-deidentified-free-text-clinical-notes-2"
+
+
+def main(root_path=None, output_path="conflict_patients.csv"):
+    """Build the dataset, export the conflicting admissions and preview them.
+
+    Args:
+        root_path: Folder containing ``note/discharge.csv.gz``; defaults to
+            ``DEFAULT_ROOT_PATH``.
+        output_path: Where the conflict CSV is written.
+    """
+    # Step 1: Set path to the data folder containing 'note/discharge.csv.gz'
+    if root_path is None:
+        root_path = DEFAULT_ROOT_PATH
+
+    # Step 2: Initialize dataset
+    dataset = MedAllergyConflictDataset(root=root_path)
+
+    # Step 3: Run the conflict detection task
+    task = AllergyConflictDetectionTask(dataset)
+
+    # Step 4: Export conflicts to CSV
+    task.export_conflicts(output_path=output_path)
+
+    # Step 5: Preview the output
+    try:
+        df = pd.read_csv(output_path)
+    except pd.errors.EmptyDataError:
+        # An export with no rows and no header cannot be parsed; show an
+        # empty preview instead of crashing.
+        df = pd.DataFrame()
+    print("\n🧾 Preview of exported conflicts:")
+    print(df.head())
+
+
+if __name__ == "__main__":
+    # Optional CLI usage: run_med_allergy_example.py [ROOT_PATH]
+    main(sys.argv[1] if len(sys.argv) > 1 else None)
diff --git a/pyhealth/tasks/__init__.py b/pyhealth/tasks/__init__.py
@@ -48,3 +48,5 @@
 )
 from .sleep_staging_v2 import SleepStagingSleepEDF
 from .temple_university_EEG_tasks import EEG_events_fn, EEG_isAbnormal_fn
+from .allergy_conflict_task import AllergyConflictDetectionTask
+
diff --git a/pyhealth/tasks/allergy_conflict_task.py b/pyhealth/tasks/allergy_conflict_task.py
@@ -0,0 +1,71 @@
+# ------------------------------------------------------------------------------
+# Author: Chidiebere Anichebe, Naveen Baskaran
+# NetID: cpa4@illinois.edu, nc42@illinois.edu
+# Contribution: Medication-Allergy Conflict Detection Task 
+# Paper Title: A Data-Centric Approach to Generate Faithful and High-Quality Patient Summaries with Large Language Models
+# Paper Link: https://physionet.org/content/ann-pt-summ/1.0.1/
+# Description:
+#   Labels whether any medication ingredient CUI in a patient admission
+#   matches a known allergy CUI.
+# ------------------------------------------------------------------------------
+
+
+from pyhealth.tasks.base_task import BaseTask
+from typing import Dict, Any
+import pandas as pd
+from pyhealth.datasets.med_allergy_conflict_dataset import MedAllergyConflictDataset
+
+
+class AllergyConflictDetectionTask(BaseTask):
+    """
+    A PyHealth Task that detects whether a medication-allergy conflict exists for a given patient.
+    Returns 1 if any ingredient CUI from medications appears in the allergy CUIs, else 0.
+
+    Args:
+        dataset: Object exposing ``get_all_patient_ids()`` and
+            ``get_patient_by_id()``; each record is a dict with
+            ``medication_cuis`` and ``allergy_cuis`` lists.
+        **kwargs: Accepted for compatibility; currently unused.
+    """
+
+    # Column order of the exported CSV; also guarantees a header row when no
+    # conflict is found.
+    EXPORT_COLUMNS = ["hadm_id", "medications", "medication_cuis", "allergies", "allergy_cuis"]
+
+    def __init__(self, dataset, **kwargs):
+        self.dataset = dataset
+        self.feature_keys = ["medication_cuis", "allergy_cuis"]
+        self.label_key = "conflict"
+
+    def get_label(self, sample: Dict[str, Any]) -> int:
+        """Return 1 if the sample's medication and allergy CUIs overlap, else 0.
+
+        Missing or ``None`` CUI lists count as empty, and falsy entries
+        (unresolved CUIs stored as ``None``) are ignored.  CUIs are compared
+        as strings.
+        """
+        med_cuis = {str(cui) for cui in (sample.get("medication_cuis") or []) if cui}
+        allergy_cuis = {str(cui) for cui in (sample.get("allergy_cuis") or []) if cui}
+        return int(not med_cuis.isdisjoint(allergy_cuis))
+
+    def __call__(self, sample: Dict[str, Any]) -> Dict[str, Any]:
+        """Return ``{"label": 0 or 1}`` for ``sample``."""
+        label = self.get_label(sample)
+        return {"label": label}
+
+    def export_conflicts(self, output_path: str = "conflict_patients.csv"):
+        """
+        Iterates through the dataset and exports all patients with medication-allergy conflicts.
+
+        The CSV always contains the header row, so it stays readable with
+        ``pandas.read_csv`` even when no conflict is found.
+
+        Args:
+            output_path (str): Path to save the CSV file with conflict details.
+        """
+        conflict_rows = []
+        for pid in self.dataset.get_all_patient_ids():
+            patient = self.dataset.get_patient_by_id(pid)
+            if not patient:
+                continue
+            if self.get_label(patient) != 1:
+                continue
+            conflict_rows.append({
+                "hadm_id": pid,
+                "medications": patient.get("medications"),
+                "medication_cuis": patient.get("medication_cuis"),
+                "allergies": patient.get("allergies"),
+                "allergy_cuis": patient.get("allergy_cuis")
+            })
+
+        df = pd.DataFrame(conflict_rows, columns=self.EXPORT_COLUMNS)
+        df.to_csv(output_path, index=False)
+        print(f"Exported {len(df)} conflict cases to {output_path}")
+
+if __name__ == "__main__":
+    import sys
+
+    # The data root may be passed as the first command-line argument; without
+    # one, the previously hard-coded path is used.
+    data_root = (
+        sys.argv[1]
+        if len(sys.argv) > 1
+        else "/Users/royal/Documents/Pyhealth_testing/mimic-iv-note-deidentified-free-text-clinical-notes-2"
+    )
+    dataset = MedAllergyConflictDataset(root=data_root)
+    task = AllergyConflictDetectionTask(dataset)
+    task.export_conflicts("conflict_patients.csv")