Merge branch '191-edsnlp-09' into 'main'

Upgrade to EDS-NLP 0.9. See merge request heka/medkit!229. changelog: Upgrade to EDS-NLP 0.9
medkit-lib · Nov 23, 2023 · 6ddd8a8 · 6ddd8a8
2 parents 213bee0 + 22153fc
commit 6ddd8a8
Show file tree

Hide file tree

Showing 9 changed files with 212 additions and 221 deletions.
diff --git a/docs/examples/edsnlp.md b/docs/examples/edsnlp.md
@@ -109,23 +109,6 @@ for attr in date_seg.attrs:
     print(f"{attr.label}={attr.value}")
 ```
 
-You may notice that the attributes created by the EDS-NLP components have been
-slightly transformed. For instance, `eds.hypothesis` creates identical
-`"hypothesis"` and `"hypothesis_"` attributes, as well as an optional
-`"hypothesis_cues"` attribute. When transforming these back to medkit, the
-redundant `"hypothesis_"` attribute is dropped, and `"hypothesis_cues"` is
-integrated as additional metadata of the `"hypothesis"` attribute (if present).
-{class}`~.EDSNLPDocPipeline` will perform this sort of transformation for many
-other EDS-NLP components.
-
-```{note}
-The transformations performed by {class}`~.EDSNLPDocPipeline` can be overridden
-or extended with the `medkit_attribute_factories` init parameter. For a list of
-all the default transformations, see
-{const}`~medkit.text.spacy.edsnlp.DEFAULT_ATTRIBUTE_FACTORIES` and corresponding
-functions in {mod}`medkit.text.spacy.edsnlp`.
-```
-
 Let's now examine more closely the `"date"` attribute:
 
 ```{code-cell} ipython3
@@ -149,11 +132,19 @@ attributes are automatically converted to a corresponding
 
 Here are the supported EDS-NLP attributes values and the corresponding medkit classes:
 - `AdicapCode` (created by `eds.adicap`): {class}`medkit.text.ner.ADICAPNormAttribute`
-- `TNM` (created by `eds.TNM`): {class}`medkit.text.ner.tnm_attribute.TNMAttribute`
+- `TNM` (created by `eds.tnm`): {class}`medkit.text.ner.tnm_attribute.TNMAttribute`
 - `AbsoluteDate` (created by `eds.dates`): {class}`medkit.text.ner.DateAttribute`
 - `RelativeDate` (created by `eds.dates`): {class}`medkit.text.ner.RelativeDateAttribute`
 - `Duration` (created by `eds.dates`): {class}`medkit.text.ner.DurationAttribute`
 
+```{note}
+The transformations performed by {class}`~.EDSNLPDocPipeline` can be overridden
+or extended with the `medkit_attribute_factories` init parameter. For a list of
+all the default transformations, see
+{const}`~medkit.text.spacy.edsnlp.DEFAULT_ATTRIBUTE_FACTORIES` and corresponding
+functions in {mod}`medkit.text.spacy.edsnlp`.
+```
+
 ## Running an EDS-NLP spaCy pipeline at the annotation level
 
 So far, we have wrapped a spaCy pipeline and executed it on an entire document

diff --git a/medkit/text/ner/edsnlp_date_matcher.py b/medkit/text/ner/edsnlp_date_matcher.py
@@ -12,7 +12,7 @@
 from medkit.core.text.operation import NEROperation
 from medkit.core.text import Segment, Entity
 from medkit.core.text import span_utils
-from medkit.text.spacy.edsnlp import build_date_attribute
+from medkit.text.spacy.edsnlp import build_date_attribute, build_duration_attribute
 
 
 class EDSNLPDateMatcher(NEROperation):
@@ -27,8 +27,8 @@ class EDSNLPDateMatcher(NEROperation):
 
     For each date that is found, an entity will be created with an attribute
     attached to it containing normalized values of the date components. The
-    attribute label will always be "date", and the class of the attribute will
-    be either class :class:`~medkit.text.ner.DateAttribute`,
+    attribute label will be either "date" or "duration", and the class of the
+    attribute will be either class :class:`~medkit.text.ner.DateAttribute`,
     :class:`~medkit.text.ner.RelativeDateAttribute` or
     :class:`~medkit.text.ner.DurationAttribute`.
     """
@@ -44,7 +44,7 @@ def __init__(
         ----------
         output_label:
             Label to use for date entities created (the label of the
-            attributes will always be "date")
+            attributes will always be "date" or "duration")
         attrs_to_copy:
             Labels of the attributes that should be copied from the input segment
             to the created date entity. Useful for propagating context attributes
@@ -92,37 +92,42 @@ def run(self, segments: List[Segment]) -> List[Entity]:
 
     def _find_dates_in_segment(self, segment, spacy_doc) -> Iterator[Entity]:
         for spacy_span in spacy_doc.spans["dates"]:
-            # convert span span start/end to medkit spans relative to doc
-            text, spans = span_utils.extract(
-                segment.text,
-                segment.spans,
-                [(spacy_span.start_char, spacy_span.end_char)],
+            yield self._build_entity(segment, spacy_span, is_duration=False)
+        for spacy_span in spacy_doc.spans["durations"]:
+            yield self._build_entity(segment, spacy_span, is_duration=True)
+
+    def _build_entity(self, segment, spacy_span, is_duration) -> Entity:
+        # convert spaCy span start/end to medkit spans relative to doc
+        text, spans = span_utils.extract(
+            segment.text,
+            segment.spans,
+            [(spacy_span.start_char, spacy_span.end_char)],
+        )
+        # create attribute storing normalized date or duration values
+        attr = (
+            build_duration_attribute(spacy_span=spacy_span, spacy_label="duration")
+            if is_duration
+            else build_date_attribute(spacy_span=spacy_span, spacy_label="date")
+        )
+        # create entity
+        entity = Entity(label=self.output_label, spans=spans, text=text, attrs=[attr])
+
+        # handle provenance
+        if self._prov_tracer is not None:
+            self._prov_tracer.add_prov(
+                entity, self.description, source_data_items=[segment]
             )
-            # create attribute storing normalized date values
-            attr = build_date_attribute(spacy_span=spacy_span, spacy_label="date")
-            # create entity
-            entity = Entity(
-                label=self.output_label, spans=spans, text=text, attrs=[attr]
+            self._prov_tracer.add_prov(
+                attr, self.description, source_data_items=[segment]
             )
 
-            # handle provenance
-            if self._prov_tracer is not None:
-                self._prov_tracer.add_prov(
-                    entity, self.description, source_data_items=[segment]
-                )
-                self._prov_tracer.add_prov(
-                    attr, self.description, source_data_items=[segment]
-                )
-
-            # copy attrs from source segment to date entity
-            for label in self.attrs_to_copy:
-                for attr in segment.attrs.get(label=label):
-                    copied_attr = attr.copy()
-                    entity.attrs.add(copied_attr)
-                    # handle provenance
-                    if self._prov_tracer is not None:
-                        self._prov_tracer.add_prov(
-                            copied_attr, self.description, [attr]
-                        )
-
-            yield entity
+        # copy attrs from source segment to date entity
+        for label in self.attrs_to_copy:
+            for attr in segment.attrs.get(label=label):
+                copied_attr = attr.copy()
+                entity.attrs.add(copied_attr)
+                # handle provenance
+                if self._prov_tracer is not None:
+                    self._prov_tracer.add_prov(copied_attr, self.description, [attr])
+
+        return entity
diff --git a/medkit/text/ner/edsnlp_tnm_matcher.py b/medkit/text/ner/edsnlp_tnm_matcher.py
@@ -12,7 +12,7 @@
 from medkit.core.text.operation import NEROperation
 from medkit.core.text import Segment, Entity
 from medkit.core.text import span_utils
-from medkit.text.spacy.edsnlp import build_value_attribute
+from medkit.text.spacy.edsnlp import build_tnm_attribute
 from medkit.text.ner.tnm_attribute import TNMAttribute
 
 
@@ -57,7 +57,7 @@ def __init__(
         self.attrs_to_copy = attrs_to_copy
 
         self._edsnlp = spacy.blank("eds")
-        self._edsnlp.add_pipe("eds.TNM")
+        self._edsnlp.add_pipe("eds.tnm")
 
     def run(self, segments: List[Segment]) -> List[Entity]:
         """Find and return TNM entities for all `segments`
@@ -95,7 +95,7 @@ def _find_tnms_in_segment(self, segment, spacy_doc) -> Iterator[Entity]:
 
             # create attribute storing normalized TNM values
             # (only TNM attributes should be found)
-            attr = build_value_attribute(spacy_span=spacy_span, spacy_label="value")
+            attr = build_tnm_attribute(spacy_span=spacy_span, spacy_label="tnm")
             assert isinstance(attr, TNMAttribute)
 
             # create entity

diff --git a/medkit/text/ner/tnm_attribute.py b/medkit/text/ner/tnm_attribute.py
@@ -9,7 +9,7 @@
 from typing import Any, ClassVar, Dict, Optional
 from typing_extensions import Self
 
-from edsnlp.pipelines.ner.scores.tnm.models import (
+from edsnlp.pipelines.ner.tnm.model import (
     TNM,
     Prefix,
     Tumour,