Skip to content

Commit 3c8fb67

Browse files
wyliYu0610
authored and committed
6627 reading dcm folder filtering filenames (Project-MONAI#7181)
Fixes Project-MONAI#6627

### Description
Adding a `fname_regex` option to the pydicom reader.

### Types of changes
<!--- Put an `x` in all the boxes that apply, and remove the not applicable items -->
- [x] Non-breaking change (fix or new feature that would not break existing functionality).
- [ ] Breaking change (fix or new feature that would cause existing functionality to change).
- [ ] New tests added to cover the changes.
- [ ] Integration tests passed locally by running `./runtests.sh -f -u --net --coverage`.
- [x] Quick tests passed locally by running `./runtests.sh --quick --unittests --disttests`.
- [x] In-line docstrings updated.
- [ ] Documentation updated, tested `make html` command in the `docs/` folder.

---------

Signed-off-by: Wenqi Li <wenqil@nvidia.com>
Signed-off-by: Yu0610 <612410030@alum.ccu.edu.tw>
1 parent 00bb5d5 commit 3c8fb67

File tree

6 files changed

+71
-11
lines changed

6 files changed

+71
-11
lines changed

monai/apps/datasets.py

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@
2222
import numpy as np
2323

2424
from monai.apps.tcia import (
25+
DCM_FILENAME_REGEX,
2526
download_tcia_series_instance,
2627
get_tcia_metadata,
2728
get_tcia_ref_uid,
@@ -442,6 +443,10 @@ class TciaDataset(Randomizable, CacheDataset):
442443
specific_tags: tags that will be loaded for "SEG" series. This argument will be used in
443444
`monai.data.PydicomReader`. Default is [(0x0008, 0x1115), (0x0008,0x1140), (0x3006, 0x0010),
444445
(0x0020,0x000D), (0x0010,0x0010), (0x0010,0x0020), (0x0020,0x0011), (0x0020,0x0012)].
446+
fname_regex: a regular expression to match the file names when the input is a folder.
447+
If provided, only the matched files will be included. For example, to include the file name
448+
"image_0001.dcm", the regular expression could be `".*image_(\\d+).dcm"`.
449+
Default to `"^(?!.*LICENSE).*"`, ignoring any file name containing `"LICENSE"`.
445450
val_frac: percentage of validation fraction in the whole dataset, default is 0.2.
446451
seed: random seed to randomly shuffle the datalist before splitting into training and validation, default is 0.
447452
note to set same seed for `training` and `validation` sections.
@@ -509,6 +514,7 @@ def __init__(
509514
(0x0020, 0x0011), # Series Number
510515
(0x0020, 0x0012), # Acquisition Number
511516
),
517+
fname_regex: str = DCM_FILENAME_REGEX,
512518
seed: int = 0,
513519
val_frac: float = 0.2,
514520
cache_num: int = sys.maxsize,
@@ -548,12 +554,13 @@ def __init__(
548554

549555
if not os.path.exists(download_dir):
550556
raise RuntimeError(f"Cannot find dataset directory: {download_dir}.")
557+
self.fname_regex = fname_regex
551558

552559
self.indices: np.ndarray = np.array([])
553560
self.datalist = self._generate_data_list(download_dir)
554561

555562
if transform == ():
556-
transform = LoadImaged(reader="PydicomReader", keys=["image"])
563+
transform = LoadImaged(keys=["image"], reader="PydicomReader", fname_regex=self.fname_regex)
557564
CacheDataset.__init__(
558565
self,
559566
data=self.datalist,

monai/apps/tcia/__init__.py

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,4 +12,11 @@
1212
from __future__ import annotations
1313

1414
from .label_desc import TCIA_LABEL_DICT
15-
from .utils import download_tcia_series_instance, get_tcia_metadata, get_tcia_ref_uid, match_tcia_ref_uid_in_study
15+
from .utils import (
16+
BASE_URL,
17+
DCM_FILENAME_REGEX,
18+
download_tcia_series_instance,
19+
get_tcia_metadata,
20+
get_tcia_ref_uid,
21+
match_tcia_ref_uid_in_study,
22+
)

monai/apps/tcia/utils.py

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -21,10 +21,18 @@
2121
requests_get, has_requests = optional_import("requests", name="get")
2222
pd, has_pandas = optional_import("pandas")
2323

24-
__all__ = ["get_tcia_metadata", "download_tcia_series_instance", "get_tcia_ref_uid", "match_tcia_ref_uid_in_study"]
25-
24+
DCM_FILENAME_REGEX = r"^(?!.*LICENSE).*" # excluding the file with "LICENSE" in its name
2625
BASE_URL = "https://services.cancerimagingarchive.net/nbia-api/services/v1/"
2726

27+
__all__ = [
28+
"get_tcia_metadata",
29+
"download_tcia_series_instance",
30+
"get_tcia_ref_uid",
31+
"match_tcia_ref_uid_in_study",
32+
"DCM_FILENAME_REGEX",
33+
"BASE_URL",
34+
]
35+
2836

2937
def get_tcia_metadata(query: str, attribute: str | None = None) -> list:
3038
"""

monai/data/image_reader.py

Lines changed: 18 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313

1414
import glob
1515
import os
16+
import re
1617
import warnings
1718
from abc import ABC, abstractmethod
1819
from collections.abc import Callable, Iterable, Iterator, Sequence
@@ -403,8 +404,12 @@ class PydicomReader(ImageReader):
403404
label_dict: label of the dicom data. If provided, it will be used when loading segmentation data.
404405
Keys of the dict are the classes, and values are the corresponding class number. For example:
405406
for TCIA collection "C4KC-KiTS", it can be: {"Kidney": 0, "Renal Tumor": 1}.
407+
fname_regex: a regular expression matched (via `re.match`) against each file path when the input is a folder.
408+
If provided, only the matched files will be included. For example, to include the file name
409+
"image_0001.dcm", the regular expression could be `".*image_(\\d+).dcm"`. Default to `""` (matches every file).
410+
Set it to `None` to use `pydicom.misc.is_dicom` to match valid files.
406411
kwargs: additional args for `pydicom.dcmread` API. more details about available args:
407-
https://pydicom.github.io/pydicom/stable/reference/generated/pydicom.filereader.dcmread.html#pydicom.filereader.dcmread
412+
https://pydicom.github.io/pydicom/stable/reference/generated/pydicom.filereader.dcmread.html
408413
If the `get_data` function will be called
409414
(for example, when using this reader with `monai.transforms.LoadImage`), please ensure that the argument
410415
`stop_before_pixels` is `True`, and `specific_tags` covers all necessary tags, such as `PixelSpacing`,
@@ -418,6 +423,7 @@ def __init__(
418423
swap_ij: bool = True,
419424
prune_metadata: bool = True,
420425
label_dict: dict | None = None,
426+
fname_regex: str = "",
421427
**kwargs,
422428
):
423429
super().__init__()
@@ -427,6 +433,7 @@ def __init__(
427433
self.swap_ij = swap_ij
428434
self.prune_metadata = prune_metadata
429435
self.label_dict = label_dict
436+
self.fname_regex = fname_regex
430437

431438
def verify_suffix(self, filename: Sequence[PathLike] | PathLike) -> bool:
432439
"""
@@ -467,9 +474,16 @@ def read(self, data: Sequence[PathLike] | PathLike, **kwargs):
467474
name = f"{name}"
468475
if Path(name).is_dir():
469476
# read DICOM series
470-
series_slcs = glob.glob(os.path.join(name, "*"))
471-
series_slcs = [slc for slc in series_slcs if "LICENSE" not in slc]
472-
slices = [pydicom.dcmread(fp=slc, **kwargs_) for slc in series_slcs]
477+
if self.fname_regex is not None:
478+
series_slcs = [slc for slc in glob.glob(os.path.join(name, "*")) if re.match(self.fname_regex, slc)]
479+
else:
480+
series_slcs = [slc for slc in glob.glob(os.path.join(name, "*")) if pydicom.misc.is_dicom(slc)]
481+
slices = []
482+
for slc in series_slcs:
483+
try:
484+
slices.append(pydicom.dcmread(fp=slc, **kwargs_))
485+
except pydicom.errors.InvalidDicomError as e:
486+
warnings.warn(f"Failed to read {slc} with exception: \n{e}.", stacklevel=2)
473487
img_.append(slices if len(slices) > 1 else slices[0])
474488
if len(slices) > 1:
475489
self.has_series = True

tests/test_load_image.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -226,6 +226,11 @@ def test_itk_dicom_series_reader(self, input_param, filenames, expected_shape, e
226226
)
227227
self.assertTupleEqual(result.shape, expected_np_shape)
228228

229+
def test_no_files(self):
230+
with self.assertRaisesRegex(RuntimeError, "list index out of range"): # fname_regex excludes everything
231+
LoadImage(image_only=True, reader="PydicomReader", fname_regex=r"^(?!.*).*")("tests/testing_data/CT_DICOM")
232+
LoadImage(image_only=True, reader="PydicomReader", fname_regex=None)("tests/testing_data/CT_DICOM")
233+
229234
def test_itk_dicom_series_reader_single(self):
230235
result = LoadImage(image_only=True, reader="ITKReader")(self.data_dir)
231236
self.assertEqual(result.meta["filename_or_obj"], f"{Path(self.data_dir)}")

tests/test_tciadataset.py

Lines changed: 22 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@
1616
import unittest
1717

1818
from monai.apps import TciaDataset
19-
from monai.apps.tcia import TCIA_LABEL_DICT
19+
from monai.apps.tcia import DCM_FILENAME_REGEX, TCIA_LABEL_DICT
2020
from monai.data import MetaTensor
2121
from monai.transforms import Compose, EnsureChannelFirstd, LoadImaged, ScaleIntensityd
2222
from tests.utils import skip_if_downloading_fails, skip_if_quick
@@ -32,7 +32,12 @@ def test_values(self):
3232

3333
transform = Compose(
3434
[
35-
LoadImaged(keys=["image", "seg"], reader="PydicomReader", label_dict=TCIA_LABEL_DICT[collection]),
35+
LoadImaged(
36+
keys=["image", "seg"],
37+
reader="PydicomReader",
38+
fname_regex=DCM_FILENAME_REGEX,
39+
label_dict=TCIA_LABEL_DICT[collection],
40+
),
3641
EnsureChannelFirstd(keys="image", channel_dim="no_channel"),
3742
ScaleIntensityd(keys="image"),
3843
]
@@ -82,10 +87,24 @@ def _test_dataset(dataset):
8287
self.assertTupleEqual(data[0]["image"].shape, (256, 256, 24))
8388
self.assertEqual(len(data), int(download_len * val_frac))
8489
data = TciaDataset(
85-
root_dir=testing_dir, collection=collection, section="validation", download=False, val_frac=val_frac
90+
root_dir=testing_dir,
91+
collection=collection,
92+
section="validation",
93+
download=False,
94+
fname_regex=DCM_FILENAME_REGEX,
95+
val_frac=val_frac,
8696
)
8797
self.assertTupleEqual(data[0]["image"].shape, (256, 256, 24))
8898
self.assertEqual(len(data), download_len)
99+
with self.assertWarns(UserWarning):
100+
data = TciaDataset(
101+
root_dir=testing_dir,
102+
collection=collection,
103+
section="validation",
104+
fname_regex=".*", # all files including 'LICENSE' is not a valid input
105+
download=False,
106+
val_frac=val_frac,
107+
)[0]
89108

90109
shutil.rmtree(os.path.join(testing_dir, collection))
91110
try:

0 commit comments

Comments
 (0)