add code to create raw tsv and make a markdown table

marcobarilari · marcobarilari · commit 07f8060ebf22 · 2023-10-20T17:40:54.000+02:00
diff --git a/mkdocs.yml b/mkdocs.yml
@@ -3,9 +3,9 @@ site_name: CPP Lab datasets
 repo_url: https://github.com/cpp-lln-lab/Datasets.git
 
 nav:
-- CPP Lab raw data:
-  - Home: README.md
-  - Contributing: CONTRIBUTING.md
+- Home: README.md
+- Contributing: CONTRIBUTING.md
+- Raw datasets table: datasets_raw.md
 
 theme:
   name: material
diff --git a/src/datasets_raw.md b/src/datasets_raw.md
diff --git a/tools/__pycache__/utils.cpython-310.pyc b/tools/__pycache__/utils.cpython-310.pyc
diff --git a/tools/datasets_raw.tsv b/tools/datasets_raw.tsv
@@ -0,0 +1,24 @@
+name	nb_subjects	has_participant_tsv	has_participant_json	participant_columns	has_phenotype_dir	modalities	sessions	tasks	fmriprep	freesurfer	mriqc
+2008_Montreal_BlindRestingState_OC_raw	50	True	True	['participant_id', 'group', 'match', 'gender', 'age', 'handedness', 'onset_blindness', 'years_of_blindness', 'total_blindness', 'years_of_total_blindness']	False	['anat', 'func']	[]	['rest']	n/a	n/a	n/a
+2012_Montreal_AudVisMotion_GD_raw	35	False	False	n/a	False	['anat', 'func']	[]	['audMotion', 'visMotion']	n/a	n/a	n/a
+2014_Toronto_CataractsDiffusion_Gao_raw	26	True	True	['/annex/objects/MD5E-s436--a3ab24b813254f34bc5773ec7427c33c.tsv']	False	['anat', 'dwi']	['1']	n/a	n/a	n/a	n/a
+2015_Trento_Diffusion_SM_raw	30	True	True	['participant_id', 'group', 'group_counter', 'initials', 'recording']	False	['anat', 'dwi', 'func']	[]	['unknown']	n/a	n/a	n/a
+2015_Trento_MultimConn_SM_raw	29	True	True	['/annex/objects/MD5E-s567--adb7ca1aa64cfe8680a842000ec06a43.tsv']	False	['anat', 'dwi', 'func']	['01']	['Catgs']	n/a	n/a	n/a
+2015_Trento_categs_SM_raw	53	False	False	n/a	False	['anat', 'dwi', 'func']	[]	['Catgs']	n/a	n/a	n/a
+2016_Trento_Categs_MR_raw	29	False	False	n/a	False	['anat', 'dwi', 'func']	['01']	['Catgs']	n/a	n/a	n/a
+2016_Trento_MultimodalMotion_MR_raw	23	False	False	n/a	False	['anat', 'func']	['01']	['audMotion', 'motDecoding', 'visMotion']	n/a	n/a	n/a
+2016_Trento_SpatiotopyDir_CB_raw	33	True	True	['/annex/objects/MD5E-s724--cfd24d6ce2366dd29831a737b3f48752.tsv']	False	['anat', 'func']	[]	['pRF']	n/a	n/a	n/a
+2018_LLN_FPAS_FB_raw	16	False	False	n/a	False	['eeg']	[]	['FPAS']	n/a	n/a	n/a
+2018_Toronto_Cataract_SM_raw	46	False	False	n/a	False	['anat', 'func']	[]	['categ', 'categBlur1', 'categBlur2']	n/a	n/a	n/a
+2019_Nancy_iEEG_FB_raw	1	True	False	['participant_id']	False	['ieeg']	['1', '2', '3', '4']	['ERPaudmot', 'ERPbraille', 'ERPcategory', 'FPSaudmot', 'FPSemotion', 'FPSface', 'FPSlexical', 'FPSvismot', 'FPSvoice', 'Readingbraille', 'Restingstate', 'Speechtracking', 'Stimulation']	n/a	n/a	n/a
+2021_Dijon_BabyFPAS_RPC_raw	23	False	False	n/a	False	['eeg']	[]	['FPAS']	n/a	n/a	n/a
+2021_LLN_FVDE_FB_raw	20	False	False	n/a	False	['eeg']	['001']	['fvde']	n/a	n/a	n/a
+2021_LLN_emotion_ST_raw	14	False	False	n/a	False	['eeg']	[]	['emotion']	n/a	n/a	n/a
+2021_SaintLuc_TmsMT_FB_raw	16	True	True	['participant_id']	False	['anat', 'beh', 'func']	['mri', 'sham', 'tms']	['audInstrumentDiscrimination', 'audMotionDirDiscrimination', 'auditoryLocalizer', 'visColourDiscrimination', 'visMotionDirDiscrimination', 'visualLocalizer']	n/a	n/a	n/a
+2021_SaintLuc_combiEmo_FF_raw	24	True	True	['participant_id', 'education', 'bmi']	False	['anat', 'func']	['01', '02']	['correction', 'eventrelatedCombiemoAuditory', 'eventrelatedCombiemoBimdal', 'eventrelatedCombiemoBimodal', 'eventrelatedCombiemoVisual', 'facelocalizerCombiemo', 'facelocalizerCombiemoCombiemo', 'voicelocalizer', 'voicelocalizerCombiemo']	n/a	n/a	n/a
+2022_SaintLuc_lipSpeech_AA_raw	2	False	False	n/a	False	['anat', 'func']	['01', '02', '03']	['MVPAAud', 'MVPAVis', 'PhonoLoc', 'VisLoc']	n/a	n/a	n/a
+2023_Liege_BLAM_MB_raw	14	True	True	['participant_id', 'age', 'sex', 'group']	False	['anat', 'func']	['01', '02', '03', '04']	['BimodalMotionAud', 'BimodalMotionVis', 'audioV1', 'auditoryLocalizer', 'bimodalMotionAud', 'bimodalMotionVis', 'mtMstLocalizer', 'rdkBimodalMotion', 'restingState', 'visualLocalizer']	n/a	n/a	n/a
+2023_SaintLuc_VisTacMotionFoR_IS_raw	21	True	True	['participant_id', 'codename', 'date', 'age', 'gender']	False	['anat', 'func']	['001', '002']	['handDown', 'handUp', 'mtMstLocalizer', 'tactileLocalizer2', 'visual', 'visualLocalizer2']	n/a	n/a	n/a
+2023_Trento_plosBiology_YX_raw	48	True	False	['participant_id']	False	['anat', 'fmap', 'func']	[]	['judgement', 'resting']	n/a	n/a	n/a
+Toronto_VisMotionLocalizer_MR_raw	3	True	False	['participant_id']	False	['anat', 'func']	['01']	['visMotion']	n/a	n/a	n/a
+olf_blind_raw	35	True	True	['participant_id', 'Group', 'Sex', 'Age', 'Educational level', 'Smoker', 'Medication', 'Vision level', 'Use of guide dog', 'Use of white cane', 'Musical practice', 'Braille reading', 'Braille reading hand used', 'Age of total blindness onset', 'Blindness Reason', 'Handedness', 'DK_C1_Letter_Fluency', 'DK_C2_Category_Fluency', 'DK_C3_Category_SwitchingTC', 'DK_C3_Category_SwitchingACC', 'TEA_C2', 'TEA_C3', 'CVLT_T1_T5', 'CVLT_ImmediateFreeRecall', 'CVLT_ImmediateCuedRecall', 'CVLT_DelayedFreeRecall', 'CVLT_DelayedCuedRecall', 'CVLT_Recognition', 'SS_Iden_O1', 'SS_Iden_O2', 'SS_Iden_O3', 'SS_Iden_O4', 'SS_Iden_O5', 'SS_Iden_O6', 'SS_Iden_O7', 'SS_Iden_O8', 'SS_Iden_O9', 'SS_Iden_O10', 'SS_Iden_O11', 'SS_Iden_O12', 'SS_Iden_O13', 'SS_Iden_O14', 'SS_Iden_O15', 'SS_Iden_O16', 'SS_Pls_O1', 'SS_Pls_O2', 'SS_Pls_O3', 'SS_Pls_O4', 'SS_Pls_O5', 'SS_Pls_O6', 'SS_Pls_O7', 'SS_Pls_O8', 'SS_Pls_O9', 'SS_Pls_O10', 'SS_Pls_O11', 'SS_Pls_O12', 'SS_Pls_O13', 'SS_Pls_O14', 'SS_Pls_O15', 'SS_Pls_O16', 'SS_Int_O1', 'SS_Int_O2', 'SS_Int_O3', 'SS_Int_O4', 'SS_Int_O5', 'SS_Int_O6', 'SS_Int_O7', 'SS_Int_O8', 'SS_Int_O9', 'SS_Int_O10', 'SS_Int_O11', 'SS_Int_O12', 'SS_Int_O13', 'SS_Int_O14', 'SS_Int_O15', 'SS_Int_O16', 'SS_Threshold_rightN', 'SS_Threshold_leftN', 'SS_Discrimination_Total', 'SS_OM_Total', 'SS_OM_Hits', 'SS_OM_FalseAlarms', 'SS_OM_Miss', 'SS_OM_Correct_Rejection', 'SS_OM_O1', 'SS_OM_O2', 'SS_OM_O3', 'SS_OM_O4', 'SS_OM_O5', 'SS_OM_O6', 'SS_OM_O7', 'SS_OM_O8', 'SS_OM_O9', 'SS_OM_O10', 'SS_OM_O11', 'SS_OM_O12', 'SS_OM_O13', 'SS_OM_O14', 'SS_OM_O15', 'SS_OM_O16', 'IRM_Pleasantness_Eucalyptus', 'IRM_Pleasantness_Almond', 'IRM_Intensity_Eucalyptus', 'IRM_Intensity_Almond']	False	['anat', 'dwi', 'func']	[]	['olfid', 'olfloc', 'rest']	n/a	n/a	n/a
diff --git a/tools/list_raw.py b/tools/list_raw.py
@@ -0,0 +1,38 @@
+"""List datasets contents on cpp-lln-lab_raw and write the results in a tsv file.
+
+to do:
+
+- [ ] Also check for derivatives folders for mriqc, fmriprep and freesurfer.
+"""
+
+from pathlib import Path
+
+import pandas as pd
+
+from utils import init_dataset
+from utils import list_datasets_in_dir
+
+cpp_raw = Path(__file__).parent.parent / 'cpp-lln-lab_raw'
+
+
+# Overwrite the tsv file with the current raw datasets
+
+DEBUG = False
+
+datasets = init_dataset()
+input_dir = cpp_raw
+datasets = list_datasets_in_dir(datasets, input_dir, debug=DEBUG)
+
+datasets_df = pd.DataFrame.from_dict(datasets)
+
+datasets_df = datasets_df.sort_values("name")
+
+root_dir = Path(__file__).parent.parent
+
+output_file = Path(__file__).parent / 'datasets_raw.tsv'
+
+datasets_df.to_csv(output_file, index=False, sep="\t")
+
+mk_file = Path(__file__).parent.parent / 'src/datasets_raw.md'
+
+pd.read_csv(output_file, sep = "\t").to_markdown(mk_file, index=False, mode="a")
diff --git a/tools/print_dataset_listing.py b/tools/print_dataset_listing.py
@@ -0,0 +1,15 @@
+"""Take the listing of raw datasets
+and turn it into a markdown document with a series of markdown tables."""
+
+from pathlib import Path
+import pandas as pd
+from bids import BIDSLayout
+
+# Ordered list of column names.
+# Presumably the column order of the markdown tables mentioned in the module
+# docstring - TODO confirm.
+# NOTE(review): not referenced anywhere in the visible part of this file.
+column_order = [
+    "name",
+    "description",
+    "datatypes",
+    "suffixes",
+    "link to full data",
+    "maintained by",
+]
diff --git a/tools/utils.py b/tools/utils.py
@@ -0,0 +1,169 @@
+
+"""Utility functions for tools."""
+
+from typing import Any
+import pandas as pd
+from pathlib import Path
+from warnings import warn
+
+
+def new_dataset(name: str) -> dict[str, str | int | bool | list[str]]:
+    return {
+        "name": name,
+        "nb_subjects": "n/a",
+        "has_participant_tsv": "n/a",
+        "has_participant_json": "n/a",
+        "participant_columns": "n/a",
+        "has_phenotype_dir": "n/a",
+        "modalities": "n/a",
+        "tasks": "n/a",
+        # "raw": f"{URL_GIN}{name}",
+        "fmriprep": "n/a",
+        "freesurfer": "n/a",
+        "mriqc": "n/a",
+    }
+
+def init_dataset() -> dict[str, list[Any]]:
+    return {
+        "name": [],
+        "nb_subjects": [],  # usually the number of subjects folder in raw dataset
+        "has_participant_tsv": [],
+        "has_participant_json": [],
+        "participant_columns": [],
+        "has_phenotype_dir": [],
+        "modalities": [],
+        "sessions": [],  # list of sessions if exist
+        "tasks": [],
+        # "raw": [],  # link to raw dataset
+        "fmriprep": [],  # link to fmriprep dataset if exists
+        "freesurfer": [],  # link to freesurfer dataset if exists
+        "mriqc": [],  # link to mriqc dataset if exists
+    }
+    
+def is_known_bids_modality(modality: str) -> bool:
+    KNOWN_MODALITIES = [
+        "anat",
+        "dwi",
+        "func",
+        "perf",
+        "fmap",
+        "beh",
+        "meg",
+        "eeg",
+        "ieeg",
+        "pet",
+        "micr",
+        "nirs",
+        "motion",
+    ]
+    return modality in KNOWN_MODALITIES
+    
+def list_modalities(bids_pth: Path, sessions: list[str]) -> list[str]:
+    pattern = "sub-*/ses-*/*" if sessions else "sub-*/*"
+    sub_dirs = [v.name for v in bids_pth.glob(pattern) if v.is_dir()]
+    modalities = [v for v in set(sub_dirs) if is_known_bids_modality(v)]
+    return list(set(modalities))
+
+def list_data_files(bids_pth: Path, sessions: list[str]) -> list[str]:
+    """Return the list of files in BIDS raw."""
+    pattern = "sub-*/ses-*/*/*" if sessions else "sub-*/*/*"
+    files = [v.name for v in bids_pth.glob(pattern) if "task-" in v.name]
+    return files
+
+def list_tasks(bids_pth: Path, sessions: list[str]) -> list[str]:
+    files = list_data_files(bids_pth, sessions)
+    tasks = [f.split("task-")[1].split("_")[0] for f in files]
+    tasks = list(set(tasks))
+    return tasks
+
+def get_nb_subjects(pth: Path) -> int:
+    return len(list_participants_in_dataset(pth))
+
+def has_participant_tsv(pth: Path) -> tuple[bool, bool, str | list[str]]:
+    tsv_status = bool((pth / "participants.tsv").exists())
+    json_status = bool((pth / "participants.json").exists())
+    if tsv_status:
+        return tsv_status, json_status, list_participants_tsv_columns(pth / "participants.tsv")
+    else:
+        return tsv_status, json_status, "n/a"
+    
+def list_participants_tsv_columns(participant_tsv: Path) -> list[str]:
+    """Return the list of columns in participants.tsv."""
+    try:
+        df = pd.read_csv(participant_tsv, sep="\t")
+        return df.columns.tolist()
+    except pd.errors.ParserError:
+        warn(f"Could not parse: {participant_tsv}")
+        return ["cannot be parsed"]
+
+def list_datasets_in_dir(
+    datasets: dict[str, list[Any]], path: Path, debug: bool
+) -> dict[str, list[Any]]:
+    print(f"Listing datasets in {path}")
+
+    raw_datasets = sorted(list(path.glob("*raw")))
+
+    # derivatives = known_derivatives()
+
+    for i, dataset_pth in enumerate(raw_datasets):
+        if debug and i > 10:
+            break
+
+        dataset_name = dataset_pth.name
+        print(f" {dataset_name}")
+
+        dataset = new_dataset(dataset_name)
+        dataset["nb_subjects"] = get_nb_subjects(dataset_pth)
+
+        if dataset["nb_subjects"] == 0:
+            continue
+
+        sessions = list_sessions(dataset_pth)
+        dataset["sessions"] = sessions
+
+        modalities = list_modalities(dataset_pth, sessions=sessions)
+        if any(
+            mod in modalities
+            for mod in ["func", "eeg", "ieeg", "meg", "beh", "perf", "pet", "motion"]
+        ):
+            tasks = list_tasks(dataset_pth, sessions=sessions)
+            check_task(tasks, modalities, sessions, dataset_pth)
+            dataset["tasks"] = sorted(tasks)
+        dataset["modalities"] = sorted(modalities)
+
+        tsv_status, json_status, columns = has_participant_tsv(dataset_pth)
+        dataset["has_participant_tsv"] = tsv_status
+        dataset["has_participant_json"] = json_status
+        dataset["participant_columns"] = columns
+        dataset["has_phenotype_dir"] = bool((dataset_pth / "phenotype").exists())
+
+        # dataset = add_derivatives(dataset, dataset_pth, derivatives)
+
+        if dataset["name"] in datasets["name"]:
+            raise ValueError(f"dataset {dataset['name']} already in datasets")
+
+        for keys in datasets:
+            datasets[keys].append(dataset[keys])
+
+    return datasets
+
+def list_sessions(dataset_pth: Path) -> list[str]:
+    sessions = [v.name.replace("ses-", "") for v in dataset_pth.glob("sub-*/ses-*") if v.is_dir()]
+    return sorted(list(set(sessions)))
+
+def check_task(
+    tasks: list[str], modalities: list[str], sessions: list[str], dataset_pth: Path
+) -> None:
+    """Check if tasks are present in dataset with modalities that can have tasks."""
+    if (
+        any(mod in modalities for mod in ["func", "eeg", "ieeg", "meg", "beh", "motion"])
+        and not tasks
+    ):
+        warn(
+            f"no tasks found in {dataset_pth} "
+            f"with modalities {modalities} "
+            f"and files {list_data_files(dataset_pth, sessions)}"
+        )
+        
+def list_participants_in_dataset(data_pth: Path) -> list[str]:
+    return [x.name for x in data_pth.iterdir() if x.is_dir() and x.name.startswith("sub-")]