Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions pyhealth/conflict_patients.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
hadm_id,medications,medication_cuis,allergies,allergy_cuis
22841357,"['Acetaminophen', 'Albuterol Inhaler Q6Hprn Wheezin', 'Calcium Carbonate', 'Furosemide', 'Lactulose', 'Ralte', 'Rifaximin', 'Sulfameth/Trimethoprim Ds', 'Tiotropium Bromide']","['161', '435', '1897', '4603', '6218', '719872', '35619', '10180', '69120']",['Percocet'],['161']
22927623,['Omeprazole'],['283742'],['Omeprazole'],['283742']
22634923,"['Acetaminophen', 'Docusate Sodium', 'Hydromorphone M', 'Albuterol Inhaler Q6Hprn Wheezin', 'Levothyroxine Sodium', 'Prenatal Vitamins']","['161', '82003', '3423', '435', '10582', '237076']","['Percocet', 'Cucumber']","['161', '1305719']"
20798638,['Acetaminophen'],['161'],"['Percocet', 'Cucumber', 'Tegaderm']","['161', '1305719', '10450']"
27683372,"['Acetaminophen', 'Azathioprine', 'Fluoxetine', 'Fluticasone Propionate', 'Hydromorphone', 'Metoprolol Succinate Xl', 'Omeprazole', 'Prednisone', 'Simvastatin', 'Tiotropium Bromide']","['161', '1256', '4493', '41126', '3423', '6918', '283742', '8640', '36567', '69120']","['Ragweed', 'Morphine', 'Percocet']","['124363', '7052', '161']"
29460260,"['Docusate Sodium', 'Senna', 'Pantoprazole', 'Simvastatin', 'Multivitamin', 'Lisinopril', 'Fluticasone', 'Amitriptyline', 'Aspirin', 'Compression Stockin', 'Acetaminophen']","['82003', '36387', '40790', '36567', '604365', '29046', '41126', '704', '1191', '408374', '161']","['Nifedipine Er', 'Amitriptyline', 'Prilosec Otc', 'Terazosin']","['284802', '704', '405277', '37798']"
23352834,"['Amitriptyline', 'Fluticasone', 'Lasix', 'Lisinopril', 'Pantoprazole', 'Potassium Chloride', 'Simvastatin', 'Acetaminophen', 'Aspirin', 'Docusate Sodium', 'Multivitamintxminerals', 'Zofran']","['704', '41126', '4603', '29046', '40790', '8591', '36567', '161', '1191', '82003', None, '26225']","['Nifedipine', 'Amitriptyline', 'Prilosec Otc', 'Terazosin']","['7417', '704', '405277', '37798']"
29391916,"['Amitriptyline', 'Aspirin', 'Dabi', 'Fluticasone Propionate Nasal Spry Nu Qpm', 'Lidocaine Patch Ptch Td Rle Pain', 'Lisinopril', 'Multivitamins W/Minerals', 'Pantoprazole', 'Pravastatin', 'Docusate Sodium', 'Amlodipine', 'Outpatient Lab Work Please Draw Chem Ca M']","['704', '1191', '1546356', '2661274', '2668917', '29046', '235368', '40790', '42463', '82003', '17767', '215977']","['Nifedipine', 'Amitriptyline', 'Prilosec Otc', 'Terazosin']","['7417', '704', '405277', '37798']"
1 change: 1 addition & 0 deletions pyhealth/datasets/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,3 +43,4 @@ def __init__(self, *args, **kwargs):
from .tuab import TUABDataset
from .tuev import TUEVDataset
from .utils import collate_fn_dict, collate_fn_dict_with_padding, get_dataloader
from .med_allergy_conflict_dataset import MedAllergyConflictDataset
254 changes: 254 additions & 0 deletions pyhealth/datasets/med_allergy_conflict_dataset.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,254 @@
import polars as pl
from pyhealth.datasets import BaseDataset
import os
import gzip
import shutil
import json
import pandas as pd
import re
import requests
import time
from typing import Dict, Optional
from tqdm import tqdm


class MedAllergyConflictDataset(BaseDataset):
    """Discharge-note dataset pairing each admission's medications and allergies.

    The constructor decompresses ``<root>/note/discharge.csv.gz`` (once) into
    ``<root>/file/discharge.csv``, parses the ``Allergies:`` line and the
    ``Discharge Medications:`` section out of every note's free text, and
    resolves each extracted name to an RxNorm ingredient identifier through
    the public RxNav REST API. Results are kept in ``self.patients``, keyed by
    ``hadm_id`` (as a string); each value is a dict with the keys ``hadm_id``,
    ``allergies``, ``medications``, ``allergy_cuis`` and ``medication_cuis``.

    API answers are cached in two JSON files under ``<root>/file`` so repeated
    runs do not repeat the network lookups.

    NOTE(review): the dummy ``pyhealth.yaml`` written into ``root`` exists only
    to satisfy ``BaseDataset.__init__``; what the base class does with it is
    not visible from this file -- confirm against ``BaseDataset``.
    """

    # Dosage-form / route / frequency tokens removed from medication names.
    STOPWORDS = {
        'tab', 'tablet', 'po', 'daily', 'chewable', 'cap', 'capsule', 'unit',
        'sc', 'si', 'bid', 'prn', 'ml', 'mg', 'drop', 'puff', 'tid', 'qid',
        'qh', 'qhs', 'ih', 'inh', 'soln', 'suspension', 'ointment', 'ophth', 'sol'
    }

    # Allergy entries (compared lower-cased) that carry no allergen.
    PLACEHOLDERS = {'___', 'nka', 'none', 'n/a'}

    RXNAV_BASE_URL = "https://rxnav.nlm.nih.gov/REST"
    REQUEST_TIMEOUT = 5  # seconds, applied to every RxNav request

    # A dose is an amount followed by a unit. The amount may be the "___"
    # blank marker (the same token PLACEHOLDERS treats as "no value").
    # The unit alternation is grouped and word-bounded so that a bare "g"
    # inside a drug name (e.g. "Dabigatran") no longer truncates the name.
    _DOSE_PATTERN = re.compile(
        r'(?:\b\d+(?:\.\d+)?|_{3})\s*(?:mg|meq|mcg|g|units?)\b',
        re.IGNORECASE,
    )

    _MED_SECTION_HEADER = "Discharge Medications:"

    # Placeholder configuration written when ``root`` has no pyhealth.yaml.
    _DUMMY_CONFIG = (
        "\n"
        "dataset_name: MedAllergyConflictDataset\n"
        "version: 1.0.0\n"
        "description: Dummy config for medication-allergy conflict detection.\n"
        "tables:\n"
        "  dummy_table:\n"
        "    path: dummy.csv\n"
        "    file_path: dummy.csv\n"
        "    type: custom\n"
        "    attributes: [\"dummy_attr\"]\n"
    )

    def __init__(self, root: str, **kwargs):
        """Build the dataset rooted at ``root`` and run preprocessing.

        Args:
            root: Directory that contains ``note/discharge.csv.gz``. A
                ``pyhealth.yaml`` and a ``file/`` working directory are
                created inside it when missing.
            **kwargs: Only ``dev`` (default ``False``) is read; it is
                forwarded to ``BaseDataset.__init__``.
        """
        config_path = os.path.join(root, "pyhealth.yaml")

        if not os.path.exists(config_path):
            with open(config_path, "w", encoding="utf-8") as f:
                f.write(self._DUMMY_CONFIG)

        super().__init__(
            dataset_name="MedAllergyConflictDataset",
            root=root,
            tables=[],
            dev=kwargs.get("dev", False),
            config_path=config_path
        )

        self.patients = {}
        self.extract_dir = os.path.join(root, "file")
        os.makedirs(self.extract_dir, exist_ok=True)

        self.gz_path = os.path.join(root, "note", "discharge.csv.gz")
        self.csv_path = os.path.join(self.extract_dir, "discharge.csv")
        self.med_cache_path = os.path.join(self.extract_dir, "med_cache.json")
        self.allergy_cache_path = os.path.join(self.extract_dir, "allergy_cache.json")

        self._load_data()

    def load_data(self):
        """Return an empty polars DataFrame.

        NOTE(review): presumably overrides a ``BaseDataset`` hook so the base
        class does not try to load the dummy table -- confirm.
        """
        return pl.DataFrame([])

    def _load_data(self):
        """Extract the notes CSV if needed, load it and the caches, then preprocess."""
        if not os.path.exists(self.csv_path):
            # Decompress into a side file and rename at the end, so an
            # interrupted extraction is never mistaken for a complete CSV.
            partial_path = self.csv_path + ".part"
            try:
                with gzip.open(self.gz_path, 'rb') as f_in, open(partial_path, 'wb') as f_out:
                    shutil.copyfileobj(f_in, f_out)
                os.replace(partial_path, self.csv_path)
            finally:
                if os.path.exists(partial_path):
                    os.remove(partial_path)

        self.df = pd.read_csv(self.csv_path)
        self.rxcui_atc_cache = self._load_json(self.med_cache_path)
        self.allergy_cui_cache = self._load_json(self.allergy_cache_path)
        self._preprocess()

    def _load_json(self, path: str) -> Dict:
        """Load a JSON object from ``path``.

        Returns an empty dict when the file is missing, unreadable as JSON,
        or does not contain an object: the caches are rebuildable, so a
        damaged cache must not abort dataset construction.
        """
        if not os.path.exists(path):
            return {}
        try:
            with open(path, 'r', encoding="utf-8") as f:
                loaded = json.load(f)
        except ValueError:  # json.JSONDecodeError is a ValueError
            return {}
        return loaded if isinstance(loaded, dict) else {}

    def _save_json(self, obj: Dict, path: str):
        """Write ``obj`` to ``path`` as JSON."""
        with open(path, 'w', encoding="utf-8") as f:
            json.dump(obj, f)

    def _normalize_med_name(self, name: str) -> str:
        """Lower-case ``name``, drop punctuation, STOPWORDS and bare numbers, then title-case."""
        name = name.lower()
        name = re.sub(r'[^a-z0-9\s/-]', '', name)
        kept = [
            token for token in name.split()
            if token not in self.STOPWORDS and not token.isdigit()
        ]
        return ' '.join(kept).strip().title()

    def _extract_allergies(self, text):
        """Return the text following ``Allergies:`` up to the end of that line, or None."""
        match = re.search(r"Allergies:\s*(.*?)(?:\n|$)", text)
        return match.group(1).strip() if match else None

    def _extract_medications(self, text):
        """Return normalized medication names from the ``Discharge Medications:`` section.

        The section runs from the header to the next blank line (or to the end
        of the text). Entries are split on their ``N. `` numbering; each entry
        is cut at its first dose, stripped of parenthesised text and
        punctuation, and normalized. Returns ``[]`` when the header is absent.
        """
        header = self._MED_SECTION_HEADER
        start = text.find(header)
        if start == -1:
            return []
        end = text.find("\n\n", start)
        # Drop the header in both branches (the open-ended branch used to keep it).
        section = text[start + len(header):end] if end != -1 else text[start + len(header):]
        section = section.strip()

        meds = []
        for entry in re.split(r'\n?\d+\.\s+', section):
            name = self._DOSE_PATTERN.split(entry, maxsplit=1)[0]
            name = re.sub(r'\(.*?\)', '', name)
            name = re.sub(r'[^a-zA-Z0-9\s/]+', '', name)
            cleaned = self._normalize_med_name(name)
            if cleaned:
                meds.append(cleaned)
        return meds

    def _preprocess(self, max_patients: Optional[int] = 200):
        """Populate ``self.patients`` from ``self.df`` and persist the lookup caches.

        Args:
            max_patients: Stop after this many admissions have been retained;
                ``None`` processes every row. Rows lacking usable allergies or
                medications do not count towards the limit.
        """
        print("Starting preprocessing loop...")
        count = 0
        try:
            for raw_hadm_id, text in zip(self.df['hadm_id'], self.df['text']):
                if max_patients is not None and count >= max_patients:
                    break
                if not isinstance(text, str):
                    # Missing note text is read by pandas as NaN (a float).
                    continue

                allergies = self._extract_allergies(text)
                meds = self._extract_medications(text)
                if not allergies or not meds:
                    continue

                parsed_allergies = [
                    a.strip().title() for a in re.split(r'[,/]', allergies) if a.strip()
                ]
                # Drop placeholder tokens even when real allergens accompany
                # them; otherwise e.g. "___" is sent to approximate matching
                # and may come back with an unrelated identifier.
                parsed_allergies = [
                    a for a in parsed_allergies if a.lower() not in self.PLACEHOLDERS
                ]
                if not parsed_allergies:
                    continue

                hadm_id = str(raw_hadm_id)
                self.patients[hadm_id] = {
                    "hadm_id": hadm_id,
                    "allergies": parsed_allergies,
                    "medications": meds,
                    "allergy_cuis": [self._resolve_allergy_cui(a) for a in parsed_allergies],
                    "medication_cuis": [self._resolve_med_ingredient_cui(m) for m in meds],
                }
                count += 1
                if count % 10 == 0:
                    print(f"Processed {count} patients...")

            print(f"Finished preprocessing. Total patients processed: {count}")
        finally:
            # Persist whatever was resolved even if the loop is interrupted,
            # so completed network lookups are not repeated on the next run.
            self._save_json(self.rxcui_atc_cache, self.med_cache_path)
            self._save_json(self.allergy_cui_cache, self.allergy_cache_path)

    def _get_json(self, endpoint: str, params: Optional[Dict] = None) -> Dict:
        """GET ``RXNAV_BASE_URL/endpoint`` and return the decoded JSON body.

        Raises:
            requests.exceptions.RequestException: on transport or HTTP errors.
            ValueError: if the body is not valid JSON.
        """
        response = requests.get(
            f"{self.RXNAV_BASE_URL}/{endpoint}",
            params=params,
            timeout=self.REQUEST_TIMEOUT,
        )
        response.raise_for_status()
        return response.json()

    def _lookup_rxcui(self, name: str) -> Optional[str]:
        """Return an RxNorm id for ``name``: the first name match, else the first approximate candidate."""
        exact = self._get_json("rxcui.json", {"name": name, "search": 1})
        ids = (exact.get("idGroup") or {}).get("rxnormId") or []
        if ids and ids[0]:
            return ids[0]
        approx = self._get_json("approximateTerm.json", {"term": name})
        candidates = (approx.get("approximateGroup") or {}).get("candidate") or []
        return candidates[0].get("rxcui") if candidates else None

    def _lookup_ingredient_cui(self, rxcui: str) -> str:
        """Return the first related ingredient (tty=IN) id of ``rxcui``, or ``rxcui`` itself.

        NOTE(review): only the first ingredient is kept, so a combination
        product is represented by a single ingredient id.
        """
        related = self._get_json(f"rxcui/{rxcui}/related.json", {"tty": "IN"})
        groups = (related.get("relatedGroup") or {}).get("conceptGroup") or []
        if groups:
            props = groups[0].get("conceptProperties") or []
            if props and props[0].get("rxcui"):
                return props[0]["rxcui"]
        return rxcui

    def _lookup_atc_code(self, rxcui: str) -> Optional[str]:
        """Return the first ATC class id listed for ``rxcui``, or None.

        A non-200 answer is treated as "no class information" rather than an
        error, matching the best-effort nature of this lookup.

        NOTE(review): confirm that ``/rxcui/{id}/class.json`` is the intended
        endpoint and returns the RxClass ``rxclassDrugInfoList`` structure.
        """
        response = requests.get(
            f"{self.RXNAV_BASE_URL}/rxcui/{rxcui}/class.json",
            timeout=self.REQUEST_TIMEOUT,
        )
        if response.status_code != 200:
            return None
        infos = (response.json().get("rxclassDrugInfoList") or {}).get("rxclassDrugInfo") or []
        for info in infos:
            concept = info.get("rxclassMinConceptItem") or {}
            # The class system is reported in ``classType``; ``className`` is
            # still checked because that is the only field the earlier
            # implementation looked at.
            if "ATC" in (concept.get("classType") or "") or "ATC" in (concept.get("className") or ""):
                return concept.get("classId")
        return None

    def _call_with_retries(self, lookup, max_retries: int):
        """Run ``lookup()`` up to ``max_retries`` times, pausing 1s between attempts.

        Returns:
            ``(True, result)`` once a call completes, or ``(False, None)`` if
            every attempt raised a request or JSON-decoding error.
        """
        for attempt in range(max_retries):
            try:
                return True, lookup()
            except (requests.exceptions.RequestException, ValueError):
                if attempt + 1 < max_retries:
                    time.sleep(1)
        return False, None

    def _resolve_allergy_cui(self, term: str, max_retries=3) -> Optional[str]:
        """Resolve an allergy term to an RxNorm ingredient id (cached).

        Args:
            term: Allergy name; stripped and title-cased before lookup.
            max_retries: Number of attempts made on request failures.

        Returns:
            The ingredient id, or ``None`` if RxNav has no match or every
            attempt failed. Only completed lookups are cached, so a transient
            outage is retried on the next run instead of being stored as
            "no match".
        """
        term = term.strip().title()
        if term in self.allergy_cui_cache:
            return self.allergy_cui_cache[term]

        def lookup():
            rxcui = self._lookup_rxcui(term)
            return self._lookup_ingredient_cui(rxcui) if rxcui else None

        completed, ingredient_cui = self._call_with_retries(lookup, max_retries)
        if completed:
            self.allergy_cui_cache[term] = ingredient_cui
        return ingredient_cui

    def _resolve_med_ingredient_cui(self, drug_name: str, max_retries=3) -> Optional[str]:
        """Resolve a medication name to an RxNorm ingredient id (cached).

        The cache entry for a name is the list ``[rxcui, atc_code,
        ingredient_cui]``; only the ingredient id is returned.

        Args:
            drug_name: Medication name; stripped and title-cased before lookup.
            max_retries: Number of attempts made on request failures.

        Returns:
            The ingredient id, or ``None`` if RxNav has no match or every
            attempt failed. Failed lookups are not cached.
        """
        drug_name = drug_name.strip().title()
        if drug_name in self.rxcui_atc_cache:
            return self.rxcui_atc_cache[drug_name][2]

        def lookup():
            rxcui = self._lookup_rxcui(drug_name)
            if not rxcui:
                return [None, None, None]
            atc_code = self._lookup_atc_code(rxcui)
            return [rxcui, atc_code, self._lookup_ingredient_cui(rxcui)]

        completed, entry = self._call_with_retries(lookup, max_retries)
        if not completed:
            return None
        self.rxcui_atc_cache[drug_name] = entry
        return entry[2]

    def get_all_patient_ids(self):
        """Return the ``hadm_id`` keys of all retained admissions as a list."""
        return list(self.patients.keys())

    def get_patient_by_id(self, patient_id: str) -> Optional[Dict]:
        """Return the record stored under ``patient_id``, or None if unknown."""
        return self.patients.get(patient_id)

    def export_medications(self, output_path: str):
        """Write one CSV row per (admission, medication) pair to ``output_path``."""
        rows = [
            {"hadm_id": p["hadm_id"], "medication": med, "ingredient_cui": cui}
            for p in self.patients.values()
            for med, cui in zip(p["medications"], p["medication_cuis"])
        ]
        # Explicit columns keep the header row even when there is nothing to export.
        columns = ["hadm_id", "medication", "ingredient_cui"]
        pd.DataFrame(rows, columns=columns).to_csv(output_path, index=False)

    def export_allergies(self, output_path: str):
        """Write one CSV row per (admission, allergy) pair to ``output_path``."""
        rows = [
            {"hadm_id": p["hadm_id"], "allergy": allergen, "allergy_cui": cui}
            for p in self.patients.values()
            for allergen, cui in zip(p["allergies"], p["allergy_cuis"])
        ]
        # Explicit columns keep the header row even when there is nothing to export.
        columns = ["hadm_id", "allergy", "allergy_cui"]
        pd.DataFrame(rows, columns=columns).to_csv(output_path, index=False)

if __name__ == "__main__":
    import sys

    # Usage: python med_allergy_conflict_dataset.py [DATA_ROOT]
    # DATA_ROOT must contain note/discharge.csv.gz; without an argument the
    # original placeholder path is used.
    data_root = sys.argv[1] if len(sys.argv) > 1 else "/your/data/path"
    dataset = MedAllergyConflictDataset(root=data_root)
    print(f"Total patients: {len(dataset.get_all_patient_ids())}")
21 changes: 21 additions & 0 deletions pyhealth/examples/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
# Medication-Allergy Conflict Detection Task

This module adds a custom PyHealth dataset and task for detecting conflicts between prescribed medications and known patient allergies using MIMIC-IV discharge summaries.

## Features

- Parses free-text discharge notes to extract medications and allergies
- Normalizes terms using RxNorm APIs to get ingredient CUIs
- Detects conflicts based on matching CUIs between meds and allergies
- Provides a PyHealth task interface (`get_label()`, `__call__()`, `export_conflicts()`)

## Files

- `med_allergy_conflict_dataset.py` – loads and processes MIMIC-IV discharge summaries
- `allergy_conflict_task.py` – PyHealth task that detects CUI-level conflicts
- `test_allergy_conflict_task.py` – unit tests for the task and dataset

## How to Run

```bash
python pyhealth/tasks/allergy_conflict_task.py
```
Binary file added pyhealth/examples/image.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
26 changes: 26 additions & 0 deletions pyhealth/examples/run_med_allergy_example.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
"""Example: build the medication-allergy dataset and export detected conflicts."""
import sys

import pandas as pd
from pyhealth.tasks.allergy_conflict_task import AllergyConflictDetectionTask
from pyhealth.datasets.med_allergy_conflict_dataset import MedAllergyConflictDataset  # Adjust if in a different file

# Folder containing 'note/discharge.csv.gz'; override it on the command line.
DEFAULT_ROOT_PATH = "/Users/royal/Documents/Pyhealth_testing/mimic-iv-note-deidentified-free-text-clinical-notes-2"


def main(root_path: str = DEFAULT_ROOT_PATH, output_path: str = "conflict_patients.csv"):
    """Run the example end to end.

    Args:
        root_path: Data folder containing 'note/discharge.csv.gz'.
        output_path: CSV file the detected conflicts are written to.
    """
    print("🚀 STARTING run_med_allergy_example.py")

    # Step 1: Initialize dataset
    dataset = MedAllergyConflictDataset(root=root_path)

    # Step 2: Run the conflict detection task
    task = AllergyConflictDetectionTask(dataset)

    # Step 3: Export conflicts to CSV
    task.export_conflicts(output_path=output_path)

    # Step 4: Preview the output. An export without any rows may have no
    # header line, which pandas reports as EmptyDataError.
    try:
        df = pd.read_csv(output_path)
    except pd.errors.EmptyDataError:
        print("\nNo conflicts were exported.")
        return
    print("\n🧾 Preview of exported conflicts:")
    print(df.head())


if __name__ == "__main__":
    # Usage: python run_med_allergy_example.py [ROOT_PATH [OUTPUT_CSV]]
    main(*sys.argv[1:3])
2 changes: 2 additions & 0 deletions pyhealth/tasks/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,3 +48,5 @@
)
from .sleep_staging_v2 import SleepStagingSleepEDF
from .temple_university_EEG_tasks import EEG_events_fn, EEG_isAbnormal_fn
from .allergy_conflict_task import AllergyConflictDetectionTask

71 changes: 71 additions & 0 deletions pyhealth/tasks/allergy_conflict_task.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
# ------------------------------------------------------------------------------
# Author: Chidiebere Anichebe, Naveen Baskaran
# NetID: cpa4@illinois.edu, nc42@illinois.edu
# Contribution: Medication-Allergy Conflict Detection Task
# Paper Title: A Data-Centric Approach to Generate Faithful and High-Quality Patient Summaries with Large Language Models
# Paper Link: https://physionet.org/content/ann-pt-summ/1.0.1/
# Description:
# Labels whether any medication ingredient CUI in a patient admission
# matches a known allergy CUI.
# ------------------------------------------------------------------------------


from pyhealth.tasks.base_task import BaseTask
from typing import Dict, Any
import pandas as pd
from pyhealth.datasets.med_allergy_conflict_dataset import MedAllergyConflictDataset


class AllergyConflictDetectionTask(BaseTask):
    """
    A PyHealth Task that detects whether a medication-allergy conflict exists for a given patient.
    Returns 1 if any ingredient CUI from medications appears in the allergy CUIs, else 0.

    The wrapped dataset only needs ``get_all_patient_ids()`` and
    ``get_patient_by_id(pid)``; each patient record is a dict that may carry
    ``medications``, ``medication_cuis``, ``allergies`` and ``allergy_cuis``.
    """

    # Column order of the exported CSV. Passing it explicitly keeps the header
    # row when no conflicts are found, so the file can still be read back.
    EXPORT_COLUMNS = ["hadm_id", "medications", "medication_cuis", "allergies", "allergy_cuis"]

    def __init__(self, dataset, **kwargs):
        """Store the dataset and the feature/label key names.

        Args:
            dataset: Object exposing ``get_all_patient_ids()`` and
                ``get_patient_by_id()``.
            **kwargs: Accepted for call compatibility; not used.
        """
        self.dataset = dataset
        self.feature_keys = ["medication_cuis", "allergy_cuis"]
        # NOTE(review): ``__call__`` emits its result under "label", not
        # "conflict" -- confirm which key downstream consumers expect.
        self.label_key = "conflict"

    def get_label(self, sample: Dict[str, Any]) -> int:
        """Return 1 if the sample's medication and allergy CUIs overlap, else 0.

        Missing or ``None`` CUI lists count as empty, and falsy entries
        (unresolved names stored as ``None``) are ignored. CUIs are compared
        as strings.
        """
        med_cuis = {str(cui) for cui in (sample.get("medication_cuis") or []) if cui}
        allergy_cuis = {str(cui) for cui in (sample.get("allergy_cuis") or []) if cui}
        return int(not med_cuis.isdisjoint(allergy_cuis))

    def __call__(self, sample: Dict[str, Any]) -> Dict[str, Any]:
        """Return ``{"label": 0 or 1}`` for ``sample`` (see ``get_label``)."""
        label = self.get_label(sample)
        return {"label": label}

    def export_conflicts(self, output_path: str = "conflict_patients.csv"):
        """
        Iterates through the dataset and exports all patients with medication-allergy conflicts.

        Args:
            output_path (str): Path to save the CSV file with conflict details.
        """
        conflict_rows = []
        for pid in self.dataset.get_all_patient_ids():
            patient = self.dataset.get_patient_by_id(pid)
            if not patient:
                continue
            if self.get_label(patient) != 1:
                continue
            conflict_rows.append({
                "hadm_id": pid,
                "medications": patient.get("medications"),
                "medication_cuis": patient.get("medication_cuis"),
                "allergies": patient.get("allergies"),
                "allergy_cuis": patient.get("allergy_cuis")
            })

        df = pd.DataFrame(conflict_rows, columns=self.EXPORT_COLUMNS)
        df.to_csv(output_path, index=False)
        print(f"Exported {len(df)} conflict cases to {output_path}")

if __name__ == "__main__":
    import sys

    from pyhealth.datasets.med_allergy_conflict_dataset import MedAllergyConflictDataset

    # Usage: python allergy_conflict_task.py [DATA_ROOT]
    # DATA_ROOT must contain note/discharge.csv.gz; without an argument the
    # original default location is used.
    data_root = (
        sys.argv[1]
        if len(sys.argv) > 1
        else "/Users/royal/Documents/Pyhealth_testing/mimic-iv-note-deidentified-free-text-clinical-notes-2"
    )
    dataset = MedAllergyConflictDataset(root=data_root)
    task = AllergyConflictDetectionTask(dataset)
    task.export_conflicts("conflict_patients.csv")
Loading