Skip to content

Commit

Permalink
refactor: cleaning up reader (#141)
Browse files Browse the repository at this point in the history
* refactor: cleaning up reader

* cleanup roi read

* update docs and ruff rules

* update readme

* coverage

* coverage

* add note

* add pragma
  • Loading branch information
tlambert03 authored Jun 15, 2023
1 parent 0d1242b commit 9b91e1f
Show file tree
Hide file tree
Showing 19 changed files with 397 additions and 347 deletions.
15 changes: 13 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -37,14 +37,25 @@ or from conda:
conda install -c conda-forge nd2
```

### extras
### Legacy nd2 file support

Legacy nd2 (JPEG2000) files are also supported, but require `imagecodecs`. To install with support for these files use:
Legacy nd2 (JPEG2000) files are also supported, but require `imagecodecs`. To
install with support for these files use the `legacy` extra:

```sh
pip install nd2[legacy]
```

### Faster XML parsing

Much of the metadata in the file is stored as XML. If found in the environment,
`nd2` will use [`lxml`](https://pypi.org/project/lxml/) which is much faster
than the built-in `xml` module. To install with support for `lxml` use:

```sh
pip install nd2 lxml
```

## usage and API

```python
Expand Down
44 changes: 19 additions & 25 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ description = "Yet another nd2 (Nikon NIS Elements) file reader"
readme = "README.md"
requires-python = ">=3.7"
license = { text = "BSD 3-Clause License" }
authors = [{ email = "talley.lambert@gmail.com" }, { name = "Talley Lambert" }]
authors = [{ email = "talley.lambert@gmail.com", name = "Talley Lambert" }]
classifiers = [
"Development Status :: 3 - Alpha",
"License :: OSI Approved :: BSD License",
Expand Down Expand Up @@ -71,35 +71,27 @@ version-file = "src/nd2/_version.py"
only-include = ["src"]
sources = ["src"]

# https://pycqa.github.io/isort/docs/configuration/options.html
[tool.isort]
profile = "black"
src_paths = ["src/nd2", "tests"]

# https://github.com/charliermarsh/ruff
# https://beta.ruff.rs/docs/rules/
[tool.ruff]
line-length = 88
target-version = "py37"
src = ["src/nd2", "tests"]
select = [
"E", # style errors
"F", # flakes
"D", # pydocstyle
"I", # isort
"UP", # pyupgrade
"S", # bandit
"C", # flake8-comprehensions
"B", # flake8-bugbear
"A001", # flake8-builtins
"RUF", # ruff-specific rules
"TCH", # flake8-type-checking
"E", # style errors
"F", # flakes
"D", # pydocstyle
"I", # isort
"UP", # pyupgrade
"S", # bandit
"C4", # flake8-comprehensions
"B", # flake8-bugbear
"A001", # flake8-builtins
"RUF", # ruff-specific rules
"SIM105", # contextlib.suppress
"TID", # tidy imports
"TCH", # flake8-type-checking
]
ignore = [
# these should be fixed
"D101",
"D105",
"D103",
###
"D100", # Missing docstring in public module
"D107", # Missing docstring in __init__
"D203", # 1 blank line required before class docstring
Expand All @@ -113,13 +105,14 @@ ignore = [
]

[tool.ruff.per-file-ignores]
"src/nd2/structures.py" = ["D101", "D105"] # Fix someday
"tests/*.py" = ["D", "S"]
"scripts/*.py" = ["D", "S"]

# https://docs.pytest.org/en/6.2.x/customize.html
[tool.pytest.ini_options]
minversion = "6.0"
addopts = '--color=yes'
addopts = '--color=yes --cov-config=pyproject.toml'
testpaths = ["tests"]
filterwarnings = [
"error",
Expand All @@ -143,6 +136,7 @@ ignore_missing_imports = true

# https://coverage.readthedocs.io/en/6.4/config.html
[tool.coverage.report]
show_missing = true
exclude_lines = [
"pragma: no cover",
"if TYPE_CHECKING:",
Expand All @@ -153,7 +147,7 @@ exclude_lines = [
]

[tool.coverage.run]
omit = ["tests"]
source = ["src"]

# https://github.com/mgedmin/check-manifest#configuration
[tool.check-manifest]
Expand Down
6 changes: 3 additions & 3 deletions scripts/gather.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
"""gather metadata from all files in test/data with all nd readers."""
import contextlib
import json
from pathlib import Path

Expand All @@ -19,10 +20,9 @@ def get_nd2_stats(file) -> dict:
d["pixel_size"] = m.channels[0].volume.axesCalibration
d["shape"] = fh.shape
d["axes"] = fh.axes
try:
with contextlib.suppress(Exception):
d["dtype"] = str(fh.dtype)
except Exception:
pass

fh.close()
return d

Expand Down
22 changes: 11 additions & 11 deletions src/nd2/_binary.py
Original file line number Diff line number Diff line change
Expand Up @@ -157,7 +157,7 @@ def asarray(self) -> np.ndarray:
@classmethod
def from_nd2file(cls, nd2file: ND2File) -> BinaryLayers | None:
"""Extract binary layers from an ND2 file."""
if nd2file.is_legacy:
if nd2file.is_legacy: # pragma: no cover
warnings.warn(
"`binary_data` is not supported for legacy ND2 files",
UserWarning,
Expand All @@ -166,30 +166,31 @@ def from_nd2file(cls, nd2file: ND2File) -> BinaryLayers | None:
return None
rdr = cast("LatestSDKReader", nd2file._rdr)

binary_meta = rdr._decoded_custom_data_chunk(
b"BinaryMetadata_v1!", strip_prefix=True
)

if not binary_meta:
try:
binary_meta = rdr._decode_chunk(
b"CustomDataVar|BinaryMetadata_v1!", strip_prefix=True
)
except KeyError:
return None

try:
items: dict = binary_meta["BinaryMetadata_v1"]
except KeyError:
except KeyError: # pragma: no cover
warnings.warn(
"Could not find 'BinaryMetadata_v1' tag, please open an "
"issue with this file at https://github.com/tlambert03/nd2/issues/new",
stacklevel=2,
)
return None

binseqs = sorted(x for x in rdr._meta_map if "RleZipBinarySequence" in x)
binseqs = sorted(x for x in rdr.chunkmap if b"RleZipBinarySequence" in x)
mask_items = []
for _, item in sorted(items.items()):
key = item["FileTag"]
key = item["FileTag"].encode()
_masks: list[np.ndarray | None] = []
for bs in binseqs:
if key in bs:
data = rdr._load_chunk(f"{bs}!".encode())[4:]
data = rdr._load_chunk(bs)[4:]
_masks.append(_decode_binary_mask(data) if data else None)
mask_items.append(
BinaryLayer(
Expand All @@ -216,7 +217,6 @@ def _unpack(stream: io.BufferedIOBase, strct: struct.Struct) -> tuple:
def _decode_binary_mask(data: bytes, dtype: DTypeLike = "uint16") -> np.ndarray:
# this receives data as would be extracted from a
# `CustomDataSeq|RleZipBinarySequence...` section in the metadata
# data = f._rdr._get_meta_chunk('CustomDataSeq|RleZipBinarySequence_1_v1|0')[:4]

# NOTE it is up to ND2File to strip the first 4 bytes... and not call this if there
# is no data (i.e. if the chunk is just '\x00')
Expand Down
2 changes: 2 additions & 0 deletions src/nd2/_clx_lite.py
Original file line number Diff line number Diff line change
Expand Up @@ -115,6 +115,8 @@ def _chunk_name_and_dtype(

data_type, name_length = strctBB.unpack(header)
if data_type == ELxLiteVariantType.COMPRESS:
# NOTE: the rois.nd2 test file has compressed metadata
# in b'CustomData|CustomDescriptionV1_0!'
raise NotImplementedError("Compressed metadata not yet implemented.")
if data_type in (ELxLiteVariantType.DEPRECATED, ELxLiteVariantType.UNKNOWN):
raise ValueError(f"Unknown data type in metadata header: {data_type}")
Expand Down
25 changes: 22 additions & 3 deletions src/nd2/_clx_xml.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,10 +10,11 @@
import lxml.etree

Element = Union[xml.etree.ElementTree.Element, lxml.etree._Element]
Parser = Callable[[bytes], Element]
Parser = Callable[[bytes | str], Element]
Scalar = Union[float, str, int, bytearray, bool]
JsonValue = Union[Scalar, dict[str, "JsonValue"]]
XML: Parser
ParseError: Exception

else:
try:
Expand Down Expand Up @@ -73,7 +74,15 @@ def json_from_clx_variant(
on the XML structure. (A <variant><no_name>...</no_name></variant> is the most
likely case where a scalar is returned.)
"""
node = parser(bxml.split(b"?>", 1)[-1]) # strip xml header
if bxml.startswith(b"<?xml"):
bxml = bxml.split(b"?>", 1)[-1] # strip xml header

try:
node = parser(bxml)
except SyntaxError: # when there are invalid characters in the XML
# could go straight to this ... not sure if it's slower
node = parser(bxml.decode(encoding="utf-8", errors="ignore"))

is_legacy = node.attrib.get("_VERSION") == "1.000000"
name, val = _node_name_value(node, strip_prefix, include_attrs=is_legacy)

Expand Down Expand Up @@ -123,7 +132,17 @@ def _node_name_value(
# NOTE: "no_name" is the standard name for a list-type node
# "BinaryItem" is a special case found in the BinaryMetadata_v1 tag...
# without special handling, you would only get the last item in the list
if cname in ("no_name", None, "", "BinaryItem", "TextInfoItem"):
# FIXME: handle the special cases below "" better.
if cname in (
"no_name",
None,
"",
"BinaryItem",
"TextInfoItem",
"Wavelength",
"MinSrc",
"MaxSrc",
):
if not cval:
# skip empty nodes ... the sdk does this too
continue
Expand Down
5 changes: 3 additions & 2 deletions src/nd2/_legacy/_legacy.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,9 @@

import numpy as np

from .. import structures as strct
from .._util import AXIS, VoxelSize
from nd2 import structures as strct
from nd2._util import AXIS, VoxelSize

from ._legacy_xml import parse_xml_block

if TYPE_CHECKING:
Expand Down
30 changes: 16 additions & 14 deletions src/nd2/_pysdk/_parse.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
RawAttributesDict,
RawExperimentDict,
RawMetaDict,
RawTextInfoDict,
SpectLoopPars,
SpectrumDict,
TimeLoopPars,
Expand Down Expand Up @@ -237,7 +238,7 @@ def _load_single_experiment_loop(
count = loop_params.get("pPlanes", {}).get("uiCount", count)
return strct.SpectLoop(count=count)

raise NotImplementedError(
raise NotImplementedError( # pragma: no cover
f"We've never seen a file like this! (loop_type={loop_type!r}). We'd "
"appreciate it if you would submit this file at "
"https://github.com/tlambert03/nd2/issues/new",
Expand Down Expand Up @@ -347,28 +348,29 @@ def _get_spectrum_max(item: SpectrumDict | None) -> float:
return max(spectrum, key=lambda x: x[0])[1] if spectrum else 0.0


def load_text_info(src: dict) -> strct.TextInfo:
# we only want keys that are present in the src
def load_text_info(raw_txt_info: RawTextInfoDict) -> strct.TextInfo:
# we only want keys that are present in the raw_txt_info

out = {
key: src[lookup]
key: raw_txt_info.get(lookup)
for key, lookup in (
("appVersion", "TextInfoItem_14"),
("imageId", "TextInfoItem_0"),
("type", "TextInfoItem_1"),
("group", "TextInfoItem_2"),
("sampleId", "TextInfoItem_3"),
("author", "TextInfoItem_4"),
("description", "TextInfoItem_5"),
("capturing", "TextInfoItem_6"),
("conclusion", "TextInfoItem_10"),
("sampling", "TextInfoItem_7"),
("location", "TextInfoItem_8"),
("date", "TextInfoItem_9"),
("description", "TextInfoItem_5"),
("group", "TextInfoItem_2"),
("imageId", "TextInfoItem_0"),
("conclusion", "TextInfoItem_10"),
("info1", "TextInfoItem_11"),
("info2", "TextInfoItem_12"),
("location", "TextInfoItem_8"),
("optics", "TextInfoItem_13"),
("sampleId", "TextInfoItem_3"),
("sampling", "TextInfoItem_7"),
("type", "TextInfoItem_1"),
("appVersion", "TextInfoItem_14"),
)
if src.get(lookup)
if raw_txt_info.get(lookup)
}
return cast(strct.TextInfo, out)

Expand Down
Loading

0 comments on commit 9b91e1f

Please sign in to comment.