Skip to content

Commit 07f8060

Browse files
committed
add code to create raw tsv and make a markdown table
1 parent c476e4d commit 07f8060

File tree

7 files changed

+274
-3
lines changed

7 files changed

+274
-3
lines changed

mkdocs.yml

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3,9 +3,9 @@ site_name: CPP Lab datasets
33
repo_url: https://github.com/cpp-lln-lab/Datasets.git
44

55
nav:
6-
- CPP Lab raw data:
7-
- Home: README.md
8-
- Contributing: CONTRIBUTING.md
6+
- Home: README.md
7+
- Contributing: CONTRIBUTING.md
8+
- Raw datasets table: datasets_raw.md
99

1010
theme:
1111
name: material

src/datasets_raw.md

Lines changed: 25 additions & 0 deletions
Large diffs are not rendered by default.
5.62 KB
Binary file not shown.

tools/datasets_raw.tsv

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
name nb_subjects has_participant_tsv has_participant_json participant_columns has_phenotype_dir modalities sessions tasks fmriprep freesurfer mriqc
2+
2008_Montreal_BlindRestingState_OC_raw 50 True True ['participant_id', 'group', 'match', 'gender', 'age', 'handedness', 'onset_blindness', 'years_of_blindness', 'total_blindness', 'years_of_total_blindness'] False ['anat', 'func'] [] ['rest'] n/a n/a n/a
3+
2012_Montreal_AudVisMotion_GD_raw 35 False False n/a False ['anat', 'func'] [] ['audMotion', 'visMotion'] n/a n/a n/a
4+
2014_Toronto_CataractsDiffusion_Gao_raw 26 True True ['/annex/objects/MD5E-s436--a3ab24b813254f34bc5773ec7427c33c.tsv'] False ['anat', 'dwi'] ['1'] n/a n/a n/a n/a
5+
2015_Trento_Diffusion_SM_raw 30 True True ['participant_id', 'group', 'group_counter', 'initials', 'recording'] False ['anat', 'dwi', 'func'] [] ['unknown'] n/a n/a n/a
6+
2015_Trento_MultimConn_SM_raw 29 True True ['/annex/objects/MD5E-s567--adb7ca1aa64cfe8680a842000ec06a43.tsv'] False ['anat', 'dwi', 'func'] ['01'] ['Catgs'] n/a n/a n/a
7+
2015_Trento_categs_SM_raw 53 False False n/a False ['anat', 'dwi', 'func'] [] ['Catgs'] n/a n/a n/a
8+
2016_Trento_Categs_MR_raw 29 False False n/a False ['anat', 'dwi', 'func'] ['01'] ['Catgs'] n/a n/a n/a
9+
2016_Trento_MultimodalMotion_MR_raw 23 False False n/a False ['anat', 'func'] ['01'] ['audMotion', 'motDecoding', 'visMotion'] n/a n/a n/a
10+
2016_Trento_SpatiotopyDir_CB_raw 33 True True ['/annex/objects/MD5E-s724--cfd24d6ce2366dd29831a737b3f48752.tsv'] False ['anat', 'func'] [] ['pRF'] n/a n/a n/a
11+
2018_LLN_FPAS_FB_raw 16 False False n/a False ['eeg'] [] ['FPAS'] n/a n/a n/a
12+
2018_Toronto_Cataract_SM_raw 46 False False n/a False ['anat', 'func'] [] ['categ', 'categBlur1', 'categBlur2'] n/a n/a n/a
13+
2019_Nancy_iEEG_FB_raw 1 True False ['participant_id'] False ['ieeg'] ['1', '2', '3', '4'] ['ERPaudmot', 'ERPbraille', 'ERPcategory', 'FPSaudmot', 'FPSemotion', 'FPSface', 'FPSlexical', 'FPSvismot', 'FPSvoice', 'Readingbraille', 'Restingstate', 'Speechtracking', 'Stimulation'] n/a n/a n/a
14+
2021_Dijon_BabyFPAS_RPC_raw 23 False False n/a False ['eeg'] [] ['FPAS'] n/a n/a n/a
15+
2021_LLN_FVDE_FB_raw 20 False False n/a False ['eeg'] ['001'] ['fvde'] n/a n/a n/a
16+
2021_LLN_emotion_ST_raw 14 False False n/a False ['eeg'] [] ['emotion'] n/a n/a n/a
17+
2021_SaintLuc_TmsMT_FB_raw 16 True True ['participant_id'] False ['anat', 'beh', 'func'] ['mri', 'sham', 'tms'] ['audInstrumentDiscrimination', 'audMotionDirDiscrimination', 'auditoryLocalizer', 'visColourDiscrimination', 'visMotionDirDiscrimination', 'visualLocalizer'] n/a n/a n/a
18+
2021_SaintLuc_combiEmo_FF_raw 24 True True ['participant_id', 'education', 'bmi'] False ['anat', 'func'] ['01', '02'] ['correction', 'eventrelatedCombiemoAuditory', 'eventrelatedCombiemoBimdal', 'eventrelatedCombiemoBimodal', 'eventrelatedCombiemoVisual', 'facelocalizerCombiemo', 'facelocalizerCombiemoCombiemo', 'voicelocalizer', 'voicelocalizerCombiemo'] n/a n/a n/a
19+
2022_SaintLuc_lipSpeech_AA_raw 2 False False n/a False ['anat', 'func'] ['01', '02', '03'] ['MVPAAud', 'MVPAVis', 'PhonoLoc', 'VisLoc'] n/a n/a n/a
20+
2023_Liege_BLAM_MB_raw 14 True True ['participant_id', 'age', 'sex', 'group'] False ['anat', 'func'] ['01', '02', '03', '04'] ['BimodalMotionAud', 'BimodalMotionVis', 'audioV1', 'auditoryLocalizer', 'bimodalMotionAud', 'bimodalMotionVis', 'mtMstLocalizer', 'rdkBimodalMotion', 'restingState', 'visualLocalizer'] n/a n/a n/a
21+
2023_SaintLuc_VisTacMotionFoR_IS_raw 21 True True ['participant_id', 'codename', 'date', 'age', 'gender'] False ['anat', 'func'] ['001', '002'] ['handDown', 'handUp', 'mtMstLocalizer', 'tactileLocalizer2', 'visual', 'visualLocalizer2'] n/a n/a n/a
22+
2023_Trento_plosBiology_YX_raw 48 True False ['participant_id'] False ['anat', 'fmap', 'func'] [] ['judgement', 'resting'] n/a n/a n/a
23+
Toronto_VisMotionLocalizer_MR_raw 3 True False ['participant_id'] False ['anat', 'func'] ['01'] ['visMotion'] n/a n/a n/a
24+
olf_blind_raw 35 True True ['participant_id', 'Group', 'Sex', 'Age', 'Educational level', 'Smoker', 'Medication', 'Vision level', 'Use of guide dog', 'Use of white cane', 'Musical practice', 'Braille reading', 'Braille reading hand used', 'Age of total blindness onset', 'Blindness Reason', 'Handedness', 'DK_C1_Letter_Fluency', 'DK_C2_Category_Fluency', 'DK_C3_Category_SwitchingTC', 'DK_C3_Category_SwitchingACC', 'TEA_C2', 'TEA_C3', 'CVLT_T1_T5', 'CVLT_ImmediateFreeRecall', 'CVLT_ImmediateCuedRecall', 'CVLT_DelayedFreeRecall', 'CVLT_DelayedCuedRecall', 'CVLT_Recognition', 'SS_Iden_O1', 'SS_Iden_O2', 'SS_Iden_O3', 'SS_Iden_O4', 'SS_Iden_O5', 'SS_Iden_O6', 'SS_Iden_O7', 'SS_Iden_O8', 'SS_Iden_O9', 'SS_Iden_O10', 'SS_Iden_O11', 'SS_Iden_O12', 'SS_Iden_O13', 'SS_Iden_O14', 'SS_Iden_O15', 'SS_Iden_O16', 'SS_Pls_O1', 'SS_Pls_O2', 'SS_Pls_O3', 'SS_Pls_O4', 'SS_Pls_O5', 'SS_Pls_O6', 'SS_Pls_O7', 'SS_Pls_O8', 'SS_Pls_O9', 'SS_Pls_O10', 'SS_Pls_O11', 'SS_Pls_O12', 'SS_Pls_O13', 'SS_Pls_O14', 'SS_Pls_O15', 'SS_Pls_O16', 'SS_Int_O1', 'SS_Int_O2', 'SS_Int_O3', 'SS_Int_O4', 'SS_Int_O5', 'SS_Int_O6', 'SS_Int_O7', 'SS_Int_O8', 'SS_Int_O9', 'SS_Int_O10', 'SS_Int_O11', 'SS_Int_O12', 'SS_Int_O13', 'SS_Int_O14', 'SS_Int_O15', 'SS_Int_O16', 'SS_Threshold_rightN', 'SS_Threshold_leftN', 'SS_Discrimination_Total', 'SS_OM_Total', 'SS_OM_Hits', 'SS_OM_FalseAlarms', 'SS_OM_Miss', 'SS_OM_Correct_Rejection', 'SS_OM_O1', 'SS_OM_O2', 'SS_OM_O3', 'SS_OM_O4', 'SS_OM_O5', 'SS_OM_O6', 'SS_OM_O7', 'SS_OM_O8', 'SS_OM_O9', 'SS_OM_O10', 'SS_OM_O11', 'SS_OM_O12', 'SS_OM_O13', 'SS_OM_O14', 'SS_OM_O15', 'SS_OM_O16', 'IRM_Pleasantness_Eucalyptus', 'IRM_Pleasantness_Almond', 'IRM_Intensity_Eucalyptus', 'IRM_Intensity_Almond'] False ['anat', 'dwi', 'func'] [] ['olfid', 'olfloc', 'rest'] n/a n/a n/a

tools/list_raw.py

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
"""List datasets contents on cpp-lln-lab_raw and write the results in a tsv file.
2+
3+
to do:
4+
5+
- [ ] Also check for derivatives folders for mriqc, fmriprep and freesurfer.
6+
"""
7+
8+
from pathlib import Path

import pandas as pd

from utils import init_dataset
from utils import list_datasets_in_dir

# Repository root: this script lives one level below it.
root_dir = Path(__file__).parent.parent
cpp_raw = root_dir / 'cpp-lln-lab_raw'

# Passed through to list_datasets_in_dir, which stops early when it is True.
DEBUG = False

datasets = init_dataset()
input_dir = cpp_raw
datasets = list_datasets_in_dir(datasets, input_dir, debug=DEBUG)

datasets_df = pd.DataFrame.from_dict(datasets)
datasets_df = datasets_df.sort_values("name")

# Overwrite the tsv file with the current raw datasets
output_file = Path(__file__).parent / 'datasets_raw.tsv'
datasets_df.to_csv(output_file, index=False, sep="\t")

# Render the same listing as a markdown table.
# The file is opened in write mode so that re-running the script replaces the
# table instead of appending another copy of it below the previous one.
# NOTE(review): the table is built from the tsv read back from disk; pandas
# parses "n/a" as a missing value by default, so those cells may render as
# "nan" - confirm this is the intended rendering.
mk_file = root_dir / 'src/datasets_raw.md'
pd.read_csv(output_file, sep="\t").to_markdown(mk_file, index=False, mode="w")

tools/print_dataset_listing.py

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
"""Take the listing of raw datasets
2+
and turn it into a markdown document with a series of markdown tables."""
3+
4+
from pathlib import Path
import pandas as pd
from bids import BIDSLayout

# Names of the columns of the generated listing.
# NOTE(review): presumably the order in which the columns should appear in the
# markdown tables; nothing in the visible part of this module reads
# column_order (or uses the imports above) yet - confirm once the rest of the
# script is written.
column_order = [
    "name",
    "description",
    "datatypes",
    "suffixes",
    "link to full data",
    "maintained by",
]

tools/utils.py

Lines changed: 169 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,169 @@
1+
2+
"""Utility functions for tools."""
3+
4+
from typing import Any
5+
import pandas as pd
6+
from pathlib import Path
7+
from warnings import warn
8+
9+
10+
def new_dataset(name: str) -> dict[str, str | int | bool | list[str]]:
11+
return {
12+
"name": name,
13+
"nb_subjects": "n/a",
14+
"has_participant_tsv": "n/a",
15+
"has_participant_json": "n/a",
16+
"participant_columns": "n/a",
17+
"has_phenotype_dir": "n/a",
18+
"modalities": "n/a",
19+
"tasks": "n/a",
20+
# "raw": f"{URL_GIN}{name}",
21+
"fmriprep": "n/a",
22+
"freesurfer": "n/a",
23+
"mriqc": "n/a",
24+
}
25+
26+
def init_dataset() -> dict[str, list[Any]]:
27+
return {
28+
"name": [],
29+
"nb_subjects": [], # usually the number of subjects folder in raw dataset
30+
"has_participant_tsv": [],
31+
"has_participant_json": [],
32+
"participant_columns": [],
33+
"has_phenotype_dir": [],
34+
"modalities": [],
35+
"sessions": [], # list of sessions if exist
36+
"tasks": [],
37+
# "raw": [], # link to raw dataset
38+
"fmriprep": [], # link to fmriprep dataset if exists
39+
"freesurfer": [], # link to freesurfer dataset if exists
40+
"mriqc": [], # link to mriqc dataset if exists
41+
}
42+
43+
def is_known_bids_modality(modality: str) -> bool:
    """Tell whether ``modality`` is one of the folder names recognised here.

    Parameters
    ----------
    modality : str
        Folder name to check, e.g. ``"anat"`` or ``"func"``.

    Returns
    -------
    bool
        True when the name is in the hard-coded list below, False otherwise.
    """
    recognised = (
        "anat", "dwi", "func", "perf", "fmap", "beh", "meg",
        "eeg", "ieeg", "pet", "micr", "nirs", "motion",
    )
    return modality in recognised
60+
61+
def list_modalities(bids_pth: Path, sessions: list[str]) -> list[str]:
    """List the known modality folders found in a dataset.

    Parameters
    ----------
    bids_pth : Path
        Root folder of the dataset.
    sessions : list[str]
        Sessions of the dataset; when non-empty the folders are looked up one
        level deeper, below ``sub-*/ses-*``, instead of directly below ``sub-*``.

    Returns
    -------
    list[str]
        Unique folder names accepted by ``is_known_bids_modality``,
        in no particular order.
    """
    if sessions:
        pattern = "sub-*/ses-*/*"
    else:
        pattern = "sub-*/*"

    folder_names = {entry.name for entry in bids_pth.glob(pattern) if entry.is_dir()}
    known = {name for name in folder_names if is_known_bids_modality(name)}
    return list(known)
66+
67+
def list_data_files(bids_pth: Path, sessions: list[str]) -> list[str]:
    """Return the names of the task-related entries of a dataset.

    Parameters
    ----------
    bids_pth : Path
        Root folder of the dataset.
    sessions : list[str]
        Sessions of the dataset; when non-empty the entries are looked up below
        ``sub-*/ses-*/*`` instead of ``sub-*/*``.

    Returns
    -------
    list[str]
        Name (not full path) of every matched entry containing ``"task-"``.
    """
    if sessions:
        pattern = "sub-*/ses-*/*/*"
    else:
        pattern = "sub-*/*/*"

    names = (entry.name for entry in bids_pth.glob(pattern))
    return [name for name in names if "task-" in name]
72+
73+
def list_tasks(bids_pth: Path, sessions: list[str]) -> list[str]:
    """List the task labels used in the file names of a dataset.

    The label is the text that follows ``task-`` in a file name, up to the
    next underscore.

    Parameters
    ----------
    bids_pth : Path
        Root folder of the dataset.
    sessions : list[str]
        Sessions of the dataset, forwarded to ``list_data_files``.

    Returns
    -------
    list[str]
        Unique task labels, in no particular order.
    """
    labels = set()
    for filename in list_data_files(bids_pth, sessions):
        after_key = filename.split("task-")[1]
        labels.add(after_key.split("_")[0])
    return list(labels)
78+
79+
def get_nb_subjects(pth: Path) -> int:
    """Return how many participants ``list_participants_in_dataset`` finds in ``pth``."""
    participants = list_participants_in_dataset(pth)
    return len(participants)
81+
82+
def has_participant_tsv(pth: Path) -> tuple[bool, bool, str | list[str]]:
83+
tsv_status = bool((pth / "participants.tsv").exists())
84+
json_status = bool((pth / "participants.json").exists())
85+
if tsv_status:
86+
return tsv_status, json_status, list_participants_tsv_columns(pth / "participants.tsv")
87+
else:
88+
return tsv_status, json_status, "n/a"
89+
90+
def list_participants_tsv_columns(participant_tsv: Path) -> list[str]:
    """Return the list of columns in participants.tsv.

    Parameters
    ----------
    participant_tsv : Path
        Path to the tab-separated file to read.

    Returns
    -------
    list[str]
        The column names, or ``["cannot be parsed"]`` (after a warning) when
        pandas cannot parse the file or the file is empty.
    """
    try:
        df = pd.read_csv(participant_tsv, sep="\t")
    except (pd.errors.ParserError, pd.errors.EmptyDataError):
        # An empty file raises EmptyDataError, which is not a ParserError:
        # treat it like any other unreadable file instead of aborting the
        # whole listing.
        warn(f"Could not parse: {participant_tsv}")
        return ["cannot be parsed"]
    return df.columns.tolist()
98+
99+
def list_datasets_in_dir(
    datasets: dict[str, list[Any]], path: Path, debug: bool
) -> dict[str, list[Any]]:
    """Describe every ``*raw`` entry of ``path`` and add it to ``datasets``.

    Entries are processed in sorted order. Those in which no subject is found
    are skipped; the others are appended, one value per column.

    Parameters
    ----------
    datasets : dict[str, list[Any]]
        Column-oriented store, as created by ``init_dataset``. It is modified
        in place and also returned.
    path : Path
        Folder in which to look for entries whose name ends with ``raw``.
    debug : bool
        When True, stop once the entry at index 10 has been processed.

    Returns
    -------
    dict[str, list[Any]]
        The ``datasets`` store that was passed in, with the new values appended.

    Raises
    ------
    ValueError
        If a dataset name is already present in ``datasets["name"]``.
    """
    print(f"Listing datasets in {path}")

    # Tasks are only listed when at least one of these modalities is present.
    task_modalities = ("func", "eeg", "ieeg", "meg", "beh", "perf", "pet", "motion")

    candidates = list(path.glob("*raw"))
    candidates.sort()

    # derivatives = known_derivatives()

    for index, dataset_pth in enumerate(candidates):
        if debug and index > 10:
            break

        dataset_name = dataset_pth.name
        print(f" {dataset_name}")

        nb_subjects = get_nb_subjects(dataset_pth)
        if nb_subjects == 0:
            continue

        record = new_dataset(dataset_name)
        record["nb_subjects"] = nb_subjects

        sessions = list_sessions(dataset_pth)
        record["sessions"] = sessions

        modalities = list_modalities(dataset_pth, sessions=sessions)
        record["modalities"] = sorted(modalities)
        if any(mod in modalities for mod in task_modalities):
            tasks = list_tasks(dataset_pth, sessions=sessions)
            check_task(tasks, modalities, sessions, dataset_pth)
            record["tasks"] = sorted(tasks)

        tsv_status, json_status, columns = has_participant_tsv(dataset_pth)
        record["has_participant_tsv"] = tsv_status
        record["has_participant_json"] = json_status
        record["participant_columns"] = columns

        phenotype_dir = dataset_pth / "phenotype"
        record["has_phenotype_dir"] = phenotype_dir.exists()

        # record = add_derivatives(record, dataset_pth, derivatives)

        if record["name"] in datasets["name"]:
            raise ValueError(f"dataset {record['name']} already in datasets")

        for column, values in datasets.items():
            values.append(record[column])

    return datasets
149+
150+
def list_sessions(dataset_pth: Path) -> list[str]:
    """Return the sorted, unique session labels of a dataset.

    Parameters
    ----------
    dataset_pth : Path
        Root folder of the dataset.

    Returns
    -------
    list[str]
        Names of the ``sub-*/ses-*`` folders with ``"ses-"`` removed;
        empty when there is no such folder.
    """
    labels = set()
    for entry in dataset_pth.glob("sub-*/ses-*"):
        if entry.is_dir():
            labels.add(entry.name.replace("ses-", ""))
    return sorted(labels)
153+
154+
def check_task(
    tasks: list[str], modalities: list[str], sessions: list[str], dataset_pth: Path
) -> None:
    """Warn when a dataset has task-capable modalities but no task was found.

    Parameters
    ----------
    tasks : list[str]
        Tasks found in the dataset.
    modalities : list[str]
        Modalities found in the dataset.
    sessions : list[str]
        Sessions of the dataset, only used to list the files in the warning.
    dataset_pth : Path
        Root folder of the dataset.
    """
    if tasks:
        return

    task_modalities = ("func", "eeg", "ieeg", "meg", "beh", "motion")
    if not any(mod in modalities for mod in task_modalities):
        return

    warn(
        f"no tasks found in {dataset_pth} "
        f"with modalities {modalities} "
        f"and files {list_data_files(dataset_pth, sessions)}"
    )
167+
168+
def list_participants_in_dataset(data_pth: Path) -> list[str]:
    """Return the names of the ``sub-*`` folders directly inside ``data_pth``.

    Files are ignored, as are folders whose name does not start with ``sub-``.
    """
    subjects = []
    for entry in data_pth.iterdir():
        if entry.name.startswith("sub-") and entry.is_dir():
            subjects.append(entry.name)
    return subjects

0 commit comments

Comments
 (0)