Skip to content

Commit

Permalink
Merge branch '191-edsnlp-09' into 'main'
Browse files Browse the repository at this point in the history
Upgrade to EDS-NLP 0.9

See merge request heka/medkit!229

changelog: Upgrade to EDS-NLP 0.9
  • Loading branch information
olvb committed Nov 23, 2023
2 parents 213bee0 + 22153fc commit 6ddd8a8
Show file tree
Hide file tree
Showing 9 changed files with 212 additions and 221 deletions.
27 changes: 9 additions & 18 deletions docs/examples/edsnlp.md
Original file line number Diff line number Diff line change
Expand Up @@ -109,23 +109,6 @@ for attr in date_seg.attrs:
print(f"{attr.label}={attr.value}")
```

You may notice that the attributes created by the EDS-NLP components have been
slightly transformed. For instance, `eds.hypothesis` creates identical
`"hypothesis"` and `"hypothesis_"` attributes, as well as an optional
`"hypothesis_cues"` attribute. When transforming these back to medkit, the
redundant `"hypothesis_"` attribute is dropped, and `"hypothesis_cues"` is
integrated as additional metadata of the `"hypothesis"` attribute (if present).
{class}`~.EDSNLPDocPipeline` will perform this sort of transformation for many
other EDS-NLP components.

```{note}
The transformations performed by {class}`~.EDSNLPDocPipeline` can be overridden
or extended with the `medkit_attribute_factories` init parameter. For a list of
all the default transformations, see
{const}`~medkit.text.spacy.edsnlp.DEFAULT_ATTRIBUTE_FACTORIES` and corresponding
functions in {mod}`medkit.text.spacy.edsnlp`.
```

Let's now examine more closely the `"date"` attribute:

```{code-cell} ipython3
Expand All @@ -149,11 +132,19 @@ attributes are automatically converted to a corresponding

Here are the supported EDS-NLP attribute values and the corresponding medkit classes:
- `AdicapCode` (created by `eds.adicap`): {class}`medkit.text.ner.ADICAPNormAttribute`
- `TNM` (created by `eds.TNM`): {class}`medkit.text.ner.tnm_attribute.TNMAttribute`
- `TNM` (created by `eds.tnm`): {class}`medkit.text.ner.tnm_attribute.TNMAttribute`
- `AbsoluteDate` (created by `eds.dates`): {class}`medkit.text.ner.DateAttribute`
- `RelativeDate` (created by `eds.dates`): {class}`medkit.text.ner.RelativeDateAttribute`
- `Duration` (created by `eds.dates`): {class}`medkit.text.ner.DurationAttribute`

```{note}
The transformations performed by {class}`~.EDSNLPDocPipeline` can be overridden
or extended with the `medkit_attribute_factories` init parameter. For a list of
all the default transformations, see
{const}`~medkit.text.spacy.edsnlp.DEFAULT_ATTRIBUTE_FACTORIES` and corresponding
functions in {mod}`medkit.text.spacy.edsnlp`.
```

## Running an EDS-NLP spaCy pipeline at the annotation level

So far, we have wrapped a spaCy pipeline and executed it on an entire document
Expand Down
75 changes: 40 additions & 35 deletions medkit/text/ner/edsnlp_date_matcher.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
from medkit.core.text.operation import NEROperation
from medkit.core.text import Segment, Entity
from medkit.core.text import span_utils
from medkit.text.spacy.edsnlp import build_date_attribute
from medkit.text.spacy.edsnlp import build_date_attribute, build_duration_attribute


class EDSNLPDateMatcher(NEROperation):
Expand All @@ -27,8 +27,8 @@ class EDSNLPDateMatcher(NEROperation):
For each date that is found, an entity will be created with an attribute
attached to it containing normalized values of the date components. The
attribute label will always be "date", and the class of the attribute will
be either class :class:`~medkit.text.ner.DateAttribute`,
attribute label will be either "date" or "duration", and the class of the
attribute will be either class :class:`~medkit.text.ner.DateAttribute`,
:class:`~medkit.text.ner.RelativeDateAttribute` or
:class:`~medkit.text.ner.DurationAttribute`.
"""
Expand All @@ -44,7 +44,7 @@ def __init__(
----------
output_label:
Label to use for date entities created (the label of the
attributes will always be "date")
attributes will always be "date" or "duration")
attrs_to_copy:
Labels of the attributes that should be copied from the input segment
to the created date entity. Useful for propagating context attributes
Expand Down Expand Up @@ -92,37 +92,42 @@ def run(self, segments: List[Segment]) -> List[Entity]:

def _find_dates_in_segment(self, segment, spacy_doc) -> Iterator[Entity]:
for spacy_span in spacy_doc.spans["dates"]:
# convert spaCy span start/end to medkit spans relative to doc
text, spans = span_utils.extract(
segment.text,
segment.spans,
[(spacy_span.start_char, spacy_span.end_char)],
yield self._build_entity(segment, spacy_span, is_duration=False)
for spacy_span in spacy_doc.spans["durations"]:
yield self._build_entity(segment, spacy_span, is_duration=True)

def _build_entity(self, segment, spacy_span, is_duration) -> Entity:
# convert spaCy span start/end to medkit spans relative to doc
text, spans = span_utils.extract(
segment.text,
segment.spans,
[(spacy_span.start_char, spacy_span.end_char)],
)
# create attribute storing normalized date or duration values
attr = (
build_duration_attribute(spacy_span=spacy_span, spacy_label="duration")
if is_duration
else build_date_attribute(spacy_span=spacy_span, spacy_label="date")
)
# create entity
entity = Entity(label=self.output_label, spans=spans, text=text, attrs=[attr])

# handle provenance
if self._prov_tracer is not None:
self._prov_tracer.add_prov(
entity, self.description, source_data_items=[segment]
)
# create attribute storing normalized date values
attr = build_date_attribute(spacy_span=spacy_span, spacy_label="date")
# create entity
entity = Entity(
label=self.output_label, spans=spans, text=text, attrs=[attr]
self._prov_tracer.add_prov(
attr, self.description, source_data_items=[segment]
)

# handle provenance
if self._prov_tracer is not None:
self._prov_tracer.add_prov(
entity, self.description, source_data_items=[segment]
)
self._prov_tracer.add_prov(
attr, self.description, source_data_items=[segment]
)

# copy attrs from source segment to date entity
for label in self.attrs_to_copy:
for attr in segment.attrs.get(label=label):
copied_attr = attr.copy()
entity.attrs.add(copied_attr)
# handle provenance
if self._prov_tracer is not None:
self._prov_tracer.add_prov(
copied_attr, self.description, [attr]
)

yield entity
# copy attrs from source segment to date entity
for label in self.attrs_to_copy:
for attr in segment.attrs.get(label=label):
copied_attr = attr.copy()
entity.attrs.add(copied_attr)
# handle provenance
if self._prov_tracer is not None:
self._prov_tracer.add_prov(copied_attr, self.description, [attr])

return entity
6 changes: 3 additions & 3 deletions medkit/text/ner/edsnlp_tnm_matcher.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
from medkit.core.text.operation import NEROperation
from medkit.core.text import Segment, Entity
from medkit.core.text import span_utils
from medkit.text.spacy.edsnlp import build_value_attribute
from medkit.text.spacy.edsnlp import build_tnm_attribute
from medkit.text.ner.tnm_attribute import TNMAttribute


Expand Down Expand Up @@ -57,7 +57,7 @@ def __init__(
self.attrs_to_copy = attrs_to_copy

self._edsnlp = spacy.blank("eds")
self._edsnlp.add_pipe("eds.TNM")
self._edsnlp.add_pipe("eds.tnm")

def run(self, segments: List[Segment]) -> List[Entity]:
"""Find and return TNM entities for all `segments`
Expand Down Expand Up @@ -95,7 +95,7 @@ def _find_tnms_in_segment(self, segment, spacy_doc) -> Iterator[Entity]:

# create attribute storing normalized TNM values
# (only TNM attributes should be found)
attr = build_value_attribute(spacy_span=spacy_span, spacy_label="value")
attr = build_tnm_attribute(spacy_span=spacy_span, spacy_label="tnm")
assert isinstance(attr, TNMAttribute)

# create entity
Expand Down
2 changes: 1 addition & 1 deletion medkit/text/ner/tnm_attribute.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
from typing import Any, ClassVar, Dict, Optional
from typing_extensions import Self

from edsnlp.pipelines.ner.scores.tnm.models import (
from edsnlp.pipelines.ner.tnm.model import (
TNM,
Prefix,
Tumour,
Expand Down
Loading

0 comments on commit 6ddd8a8

Please sign in to comment.