Add displaCy data structures to docs (2) (#12875)

* Add data structures to docs
* Adjusted descriptions for more consistency
* Add _optional_ flag to parameters
* Add tests and adjust optional title key in doc
* Add title to dep visualizations
* fix typo
---------
Co-authored-by: thomashacker <EdwardSchmuhl@web.de>
explosion · Jul 31, 2023 · c9e9dcc · c9e9dcc
1 parent 49055ed
commit c9e9dcc
Show file tree

Hide file tree

Showing 4 changed files with 200 additions and 1 deletion.
diff --git a/spacy/displacy/render.py b/spacy/displacy/render.py
@@ -313,6 +313,8 @@ def render(
                 self.lang = settings.get("lang", DEFAULT_LANG)
             render_id = f"{id_prefix}-{i}"
             svg = self.render_svg(render_id, p["words"], p["arcs"])
+            if p.get("title"):
+                svg = TPL_TITLE.format(title=p.get("title")) + svg
             rendered.append(svg)
         if page:
             content = "".join([TPL_FIGURE.format(content=svg) for svg in rendered])

diff --git a/spacy/tests/test_displacy.py b/spacy/tests/test_displacy.py
@@ -350,6 +350,78 @@ def wrapper(html):
     displacy.set_render_wrapper(lambda html: html)
 
 
+def test_displacy_render_manual_dep():
+    """Test displacy.render with manual data for dep style"""
+    parsed_dep = {
+        "words": [
+            {"text": "This", "tag": "DT"},
+            {"text": "is", "tag": "VBZ"},
+            {"text": "a", "tag": "DT"},
+            {"text": "sentence", "tag": "NN"},
+        ],
+        "arcs": [
+            {"start": 0, "end": 1, "label": "nsubj", "dir": "left"},
+            {"start": 2, "end": 3, "label": "det", "dir": "left"},
+            {"start": 1, "end": 3, "label": "attr", "dir": "right"},
+        ],
+        "title": "Title",
+    }
+    html = displacy.render([parsed_dep], style="dep", manual=True)
+    for word in parsed_dep["words"]:
+        assert word["text"] in html
+        assert word["tag"] in html
+
+
+def test_displacy_render_manual_ent():
+    """Test displacy.render with manual data for ent style"""
+    parsed_ents = [
+        {
+            "text": "But Google is starting from behind.",
+            "ents": [{"start": 4, "end": 10, "label": "ORG"}],
+        },
+        {
+            "text": "But Google is starting from behind.",
+            "ents": [{"start": -100, "end": 100, "label": "COMPANY"}],
+            "title": "Title",
+        },
+    ]
+
+    html = displacy.render(parsed_ents, style="ent", manual=True)
+    for parsed_ent in parsed_ents:
+        assert parsed_ent["ents"][0]["label"] in html
+        if "title" in parsed_ent:
+            assert parsed_ent["title"] in html
+
+
+def test_displacy_render_manual_span():
+    """Test displacy.render with manual data for span style"""
+    parsed_spans = [
+        {
+            "text": "Welcome to the Bank of China.",
+            "spans": [
+                {"start_token": 3, "end_token": 6, "label": "ORG"},
+                {"start_token": 5, "end_token": 6, "label": "GPE"},
+            ],
+            "tokens": ["Welcome", "to", "the", "Bank", "of", "China", "."],
+        },
+        {
+            "text": "Welcome to the Bank of China.",
+            "spans": [
+                {"start_token": 3, "end_token": 6, "label": "ORG"},
+                {"start_token": 5, "end_token": 6, "label": "GPE"},
+            ],
+            "tokens": ["Welcome", "to", "the", "Bank", "of", "China", "."],
+            "title": "Title",
+        },
+    ]
+
+    html = displacy.render(parsed_spans, style="span", manual=True)
+    for parsed_span in parsed_spans:
+        assert parsed_span["spans"][0]["label"] in html
+        if "title" in parsed_span:
+            assert parsed_span["title"] in html
+
+
 def test_displacy_options_case():
     ents = ["foo", "BAR"]
     colors = {"FOO": "red", "bar": "green"}

diff --git a/website/docs/api/top-level.mdx b/website/docs/api/top-level.mdx
@@ -343,6 +343,130 @@ use with the `manual=True` argument in `displacy.render`.
 | `options`   | Span-specific visualisation options. ~~Dict[str, Any]~~             |
 | **RETURNS** | Generated entities keyed by text (original text) and ents. ~~dict~~ |
 
+### Visualizer data structures {id="displacy_structures"}
+
+You can use displaCy's data format to manually render data. This can be useful
+if you want to visualize output from other libraries. You can find examples of
+displaCy's different data formats below.
+
+> #### DEP example data structure
+>
+> ```json
+> {
+>   "words": [
+>     { "text": "This", "tag": "DT" },
+>     { "text": "is", "tag": "VBZ" },
+>     { "text": "a", "tag": "DT" },
+>     { "text": "sentence", "tag": "NN" }
+>   ],
+>   "arcs": [
+>     { "start": 0, "end": 1, "label": "nsubj", "dir": "left" },
+>     { "start": 2, "end": 3, "label": "det", "dir": "left" },
+>     { "start": 1, "end": 3, "label": "attr", "dir": "right" }
+>   ]
+> }
+> ```
+
+#### Dependency Visualizer data structure {id="structure-dep"}
+
+| Dictionary Key | Description                                                                                                 |
+| -------------- | ----------------------------------------------------------------------------------------------------------- |
+| `words`        | List of dictionaries describing a word token (see structure below). ~~List[Dict[str, Any]]~~                |
+| `arcs`         | List of dictionaries describing the relations between words (see structure below). ~~List[Dict[str, Any]]~~ |
+| _Optional_     |                                                                                                             |
+| `title`        | Title of the visualization. ~~Optional[str]~~                                                               |
+| `settings`     | Dependency Visualizer options (see [here](/api/top-level#displacy_options)). ~~Dict[str, Any]~~             |
+
+<Accordion title="Words data structure">
+
+| Dictionary Key | Description                              |
+| -------------- | ---------------------------------------- |
+| `text`         | Text content of the word. ~~str~~        |
+| `tag`          | Fine-grained part-of-speech. ~~str~~     |
+| _Optional_     |                                          |
+| `lemma`        | Base form of the word. ~~Optional[str]~~ |
+
+</Accordion>
+
+<Accordion title="Arcs data structure">
+
+| Dictionary Key | Description                                          |
+| -------------- | ---------------------------------------------------- |
+| `start`        | The index of the starting token. ~~int~~             |
+| `end`          | The index of the ending token. ~~int~~               |
+| `label`        | The type of dependency relation. ~~str~~             |
+| `dir`          | Direction of the relation (`left`, `right`). ~~str~~ |
+
+</Accordion>
+
+> #### ENT example data structure
+>
+> ```json
+> {
+>   "text": "But Google is starting from behind.",
+>   "ents": [{ "start": 4, "end": 10, "label": "ORG" }]
+> }
+> ```
+
+#### Named Entity Recognition data structure {id="structure-ent"}
+
+| Dictionary Key | Description                                                                                 |
+| -------------- | ------------------------------------------------------------------------------------------- |
+| `text`         | String representation of the document text. ~~str~~                                         |
+| `ents`         | List of dictionaries describing entities (see structure below). ~~List[Dict[str, Any]]~~    |
+| _Optional_     |                                                                                             |
+| `title`        | Title of the visualization. ~~Optional[str]~~                                               |
+| `settings`     | Entity Visualizer options (see [here](/api/top-level#displacy_options)). ~~Dict[str, Any]~~ |
+
+<Accordion title="Ents data structure">
+
+| Dictionary Key | Description                                                            |
+| -------------- | ---------------------------------------------------------------------- |
+| `start`        | The index of the first character of the entity. ~~int~~                |
+| `end`          | The index of the end character of the entity (not inclusive). ~~int~~  |
+| `label`        | Label attached to the entity. ~~str~~                                  |
+| _Optional_     |                                                                        |
+| `kb_id`        | `KnowledgeBase` ID. ~~str~~                                            |
+| `kb_url`       | `KnowledgeBase` URL. ~~str~~                                           |
+
+</Accordion>
+
+> #### SPAN example data structure
+>
+> ```json
+> {
+>   "text": "Welcome to the Bank of China.",
+>   "spans": [
+>     { "start_token": 3, "end_token": 6, "label": "ORG" },
+>     { "start_token": 5, "end_token": 6, "label": "GPE" }
+>   ],
+>   "tokens": ["Welcome", "to", "the", "Bank", "of", "China", "."]
+> }
+> ```
+
+#### Span Classification data structure {id="structure-span"}
+
+| Dictionary Key | Description                                                                               |
+| -------------- | ----------------------------------------------------------------------------------------- |
+| `text`         | String representation of the document text. ~~str~~                                       |
+| `spans`        | List of dictionaries describing spans (see structure below). ~~List[Dict[str, Any]]~~     |
+| `tokens`       | List of word tokens. ~~List[str]~~                                                        |
+| _Optional_     |                                                                                           |
+| `title`        | Title of the visualization. ~~Optional[str]~~                                             |
+| `settings`     | Span Visualizer options (see [here](/api/top-level#displacy_options)). ~~Dict[str, Any]~~ |
+
+<Accordion title="Spans data structure">
+
+| Dictionary Key | Description                                                   |
+| -------------- | ------------------------------------------------------------- |
+| `start_token`  | The index of the first token of the span in `tokens`. ~~int~~ |
+| `end_token`    | The end index of the span in `tokens` (exclusive). ~~int~~    |
+| `label`        | Label attached to the span. ~~str~~                           |
+| _Optional_     |                                                               |
+| `kb_id`        | `KnowledgeBase` ID. ~~str~~                                   |
+| `kb_url`       | `KnowledgeBase` URL. ~~str~~                                  |
+
+</Accordion>
+
 ### Visualizer options {id="displacy_options"}
 
 The `options` argument lets you specify additional settings for each visualizer.

diff --git a/website/docs/usage/visualizers.mdx b/website/docs/usage/visualizers.mdx
@@ -349,7 +349,8 @@ or
 [SyntaxNet](https://github.com/tensorflow/models/tree/master/research/syntaxnet).
 If you set `manual=True` on either `render()` or `serve()`, you can pass in data
 in displaCy's format as a dictionary (instead of `Doc` objects). There are
-helper functions for converting `Doc` objects to displaCy's format for use with
+helper functions for converting `Doc` objects to
+[displaCy's format](/api/top-level#displacy_structures) for use with
 `manual=True`: [`displacy.parse_deps`](/api/top-level#displacy.parse_deps),
 [`displacy.parse_ents`](/api/top-level#displacy.parse_ents), and
 [`displacy.parse_spans`](/api/top-level#displacy.parse_spans).