Skip to content

Commit 008efad

Browse files
bjchambers and efriis authored
[community]: Render documents to graphviz (#24830)
- **Description:** Adds a helper that renders documents with the GraphVectorStore metadata fields to Graphviz for visualization. This is helpful for understanding and debugging. --------- Co-authored-by: Erick Friis <erick@langchain.dev>
1 parent fc80061 commit 008efad

File tree

3 files changed

+236
-0
lines changed

3 files changed

+236
-0
lines changed

libs/community/extended_testing_deps.txt

+1
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@ gliner>=0.2.7
2929
google-cloud-documentai>=2.20.1,<3
3030
gql>=3.4.1,<4
3131
gradientai>=1.4.0,<2
32+
graphviz>=0.20.3,<0.21
3233
hdbcli>=2.19.21,<3
3334
hologres-vector==0.0.6
3435
html2text>=2020.1.16
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,122 @@
1+
import re
2+
from typing import TYPE_CHECKING, Dict, Iterable, Optional, Tuple
3+
4+
from langchain_core._api import beta
5+
from langchain_core.documents import Document
6+
7+
from langchain_community.graph_vectorstores.links import get_links
8+
9+
if TYPE_CHECKING:
10+
import graphviz
11+
12+
13+
def _escape_id(id: str) -> str:
    """Return a Graphviz node name derived from a document ID.

    Every ``:`` in ``id`` is replaced with ``_``; all other characters are
    kept as-is.

    NOTE(review): presumably this is done because ``:`` separates a node name
    from a port in DOT edge statements -- confirm against the graphviz docs.
    Two IDs that differ only in ``:`` versus ``_`` map to the same node name.

    Args:
        id: The document ID to convert.

    Returns:
        The ID with each ``:`` replaced by ``_``.
    """
    return id.replace(":", "_")
15+
16+
17+
# Maps a link's direction to the value used for the Graphviz edge ``dir``
# attribute. Edges are always emitted as document -> tag, so the arrow
# direction is what distinguishes incoming, outgoing and bidirectional links.
_EDGE_DIRECTION: Dict[str, str] = {
    "in": "back",
    "out": "forward",
    "bidir": "both",
}
22+
23+
# Matches one run of non-whitespace together with any whitespace preceding it.
# The end offset of each match is therefore a position just after a word,
# which is where `_split_prefix` is allowed to cut the text.
# Raw string: "\s" and "\S" are not valid Python string escapes.
_WORD_RE = re.compile(r"\s*\S+")


def _split_prefix(s: str, max_chars: int = 50) -> str:
    """Return a prefix of ``s`` of at most ``max_chars`` characters.

    The cut is made after the last word that ends within ``max_chars``. If
    even the first word is longer than ``max_chars``, the text is cut at
    exactly ``max_chars`` characters. Whenever the returned prefix is shorter
    than ``s``, ``"..."`` is appended (this includes the case where only
    trailing whitespace was dropped).

    Args:
        s: The text to shorten.
        max_chars: Maximum number of characters of ``s`` to keep, not
            counting the appended ``"..."``.

    Returns:
        ``s`` itself when nothing was cut, otherwise the prefix followed by
        ``"..."``.
    """
    words = _WORD_RE.finditer(s)

    # Fallback cut point, used when no word ends within the limit.
    split = min(len(s), max_chars)
    for word in words:
        if word.end(0) > max_chars:
            break
        split = word.end(0)

    if split == len(s):
        return s
    else:
        return f"{s[0:split]}..."
39+
40+
41+
@beta()
def render_graphviz(
    documents: Iterable[Document],
    engine: Optional[str] = None,
    node_color: Optional[str] = None,
    node_colors: Optional[Dict[str, Optional[str]]] = None,
    skip_tags: Iterable[Tuple[str, str]] = (),
) -> "graphviz.Digraph":
    """Render a collection of GraphVectorStore documents to GraphViz format.

    Each document becomes a note-shaped node labelled with its ID and the
    start of its content. Each distinct ``(kind, tag)`` link becomes a node
    named ``tag_<n>``, connected to every document that carries that link.

    Args:
        documents: The documents to render.
        engine: GraphViz layout engine to use. `None` uses the default.
        node_color: Default node color.
        node_colors: Dictionary specifying colors of specific nodes. Useful for
            emphasizing nodes that were selected by MMR, or differ from other
            results. An entry whose value is `None` overrides `node_color`
            with no color for that node.
        skip_tags: Set of tags to skip when rendering the graph. Specified as
            tuples containing the kind and tag.

    Returns:
        The "graphviz.Digraph" representing the nodes. May be printed to source,
        or rendered using `dot`.

    Raises:
        ImportError: If the `graphviz` Python package is not installed.
        ValueError: If a document has no ID.

    Note:
        To render the generated DOT source code, you also need to install Graphviz_
        (`download page <https://www.graphviz.org/download/>`_,
        `archived versions <https://www2.graphviz.org/Archive/stable/>`_,
        `installation procedure for Windows <https://forum.graphviz.org/t/new-simplified-installation-procedure-on-windows/224>`_).
    """
    if node_colors is None:
        node_colors = {}

    try:
        import graphviz
    except ImportError as e:
        # ModuleNotFoundError is a subclass of ImportError, so one clause is
        # enough. Chain the cause so the original failure stays visible.
        raise ImportError(
            "Could not import graphviz python package. "
            "Please install it with `pip install graphviz`."
        ) from e

    graph = graphviz.Digraph(engine=engine)
    graph.attr(rankdir="LR")
    graph.attr("node", style="filled")

    skipped = set(skip_tags)
    # Graphviz node name assigned to each (kind, tag) pair seen so far.
    tags: Dict[Tuple[str, str], str] = {}

    for document in documents:
        doc_id = document.id
        if doc_id is None:
            raise ValueError(f"Illegal graph document without ID: {document}")
        escaped_id = _escape_id(doc_id)
        # Membership test rather than `.get()`: an explicit `None` entry in
        # `node_colors` must win over `node_color`.
        color = node_colors[doc_id] if doc_id in node_colors else node_color

        # Escape the assembled label as a whole. `str.join` returns a plain
        # `str`, so escaping the parts individually would lose the marker that
        # keeps a label of the form "<...>" from being treated as HTML.
        node_label = graphviz.escape(
            "\n".join([doc_id, _split_prefix(document.page_content)])
        )
        graph.node(
            escaped_id,
            label=node_label,
            shape="note",
            fillcolor=color,
            tooltip=graphviz.escape(document.page_content),
        )

        for link in get_links(document):
            tag_key = (link.kind, link.tag)
            if tag_key in skipped:
                continue

            tag_id = tags.get(tag_key)
            if tag_id is None:
                # First time this tag is seen: create its node.
                # NOTE(review): a document whose escaped ID is also of the
                # form "tag_<n>" would share a node with a tag -- confirm
                # whether such IDs can occur.
                tag_id = f"tag_{len(tags)}"
                tags[tag_key] = tag_id
                graph.node(tag_id, label=graphviz.escape(f"{link.kind}:{link.tag}"))

            graph.edge(escaped_id, tag_id, dir=_EDGE_DIRECTION[link.direction])
    return graph
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,113 @@
1+
import pytest
2+
from langchain_core.documents import Document
3+
4+
from langchain_community.graph_vectorstores.links import METADATA_LINKS_KEY, Link
5+
from langchain_community.graph_vectorstores.visualize import render_graphviz
6+
7+
8+
@pytest.mark.requires("graphviz")
def test_visualize_simple_graph() -> None:
    """Check the DOT source produced by `render_graphviz` for a two-document graph.

    Covers the default rendering, the `engine` argument, per-node colors,
    a default color overridden with `None` for one node, and `skip_tags`.
    """
    doc1 = Document(
        id="a",
        page_content="some content",
        metadata={
            METADATA_LINKS_KEY: [
                Link.incoming("href", "a"),
                Link.bidir("kw", "foo"),
            ]
        },
    )
    doc2 = Document(
        id="b",
        page_content="<some\n more content>",
        metadata={
            METADATA_LINKS_KEY: [
                Link.incoming("href", "b"),
                Link.outgoing("href", "a"),
                Link.bidir("kw", "foo"),
                Link.bidir("kw", "bar"),
            ]
        },
    )

    # Default rendering: no fill colors, tags numbered in order of first use.
    assert render_graphviz([doc1, doc2]).source == (
        "digraph {\n"
        "\trankdir=LR\n"
        "\tnode [style=filled]\n"
        '\ta [label="a\nsome content" shape=note tooltip="some content"]\n'
        '\ttag_0 [label="href:a"]\n'
        "\ta -> tag_0 [dir=back]\n"
        '\ttag_1 [label="kw:foo"]\n'
        "\ta -> tag_1 [dir=both]\n"
        '\tb [label="b\n<some\n more content>" '
        'shape=note tooltip="<some\n more content>"]\n'
        '\ttag_2 [label="href:b"]\n'
        "\tb -> tag_2 [dir=back]\n"
        "\tb -> tag_0 [dir=forward]\n"
        "\tb -> tag_1 [dir=both]\n"
        '\ttag_3 [label="kw:bar"]\n'
        "\tb -> tag_3 [dir=both]\n"
        "}\n"
    )

    # The layout engine is passed through to the Digraph.
    assert render_graphviz([doc1, doc2], engine="fdp").engine == "fdp"

    # A color given for one node only affects that node.
    assert render_graphviz([doc1, doc2], node_colors={"a": "gold"}).source == (
        "digraph {\n"
        "\trankdir=LR\n"
        "\tnode [style=filled]\n"
        '\ta [label="a\nsome content" fillcolor=gold '
        'shape=note tooltip="some content"]\n'
        '\ttag_0 [label="href:a"]\n'
        "\ta -> tag_0 [dir=back]\n"
        '\ttag_1 [label="kw:foo"]\n'
        "\ta -> tag_1 [dir=both]\n"
        '\tb [label="b\n<some\n more content>" '
        'shape=note tooltip="<some\n more content>"]\n'
        '\ttag_2 [label="href:b"]\n'
        "\tb -> tag_2 [dir=back]\n"
        "\tb -> tag_0 [dir=forward]\n"
        "\tb -> tag_1 [dir=both]\n"
        '\ttag_3 [label="kw:bar"]\n'
        "\tb -> tag_3 [dir=both]\n"
        "}\n"
    )

    # An explicit None in node_colors removes the default color for that node.
    assert render_graphviz(
        [doc1, doc2], node_color="gold", node_colors={"a": None}
    ).source == (
        "digraph {\n"
        "\trankdir=LR\n"
        "\tnode [style=filled]\n"
        '\ta [label="a\nsome content" shape=note tooltip="some content"]\n'
        '\ttag_0 [label="href:a"]\n'
        "\ta -> tag_0 [dir=back]\n"
        '\ttag_1 [label="kw:foo"]\n'
        "\ta -> tag_1 [dir=both]\n"
        '\tb [label="b\n<some\n more content>" fillcolor=gold '
        'shape=note tooltip="<some\n more content>"]\n'
        '\ttag_2 [label="href:b"]\n'
        "\tb -> tag_2 [dir=back]\n"
        "\tb -> tag_0 [dir=forward]\n"
        "\tb -> tag_1 [dir=both]\n"
        '\ttag_3 [label="kw:bar"]\n'
        "\tb -> tag_3 [dir=both]\n"
        "}\n"
    )

    # Skipped tags produce neither a node nor edges, and do not consume a
    # tag number.
    assert render_graphviz([doc1, doc2], skip_tags=[("kw", "foo")]).source == (
        "digraph {\n"
        "\trankdir=LR\n"
        "\tnode [style=filled]\n"
        '\ta [label="a\nsome content" shape=note tooltip="some content"]\n'
        '\ttag_0 [label="href:a"]\n'
        "\ta -> tag_0 [dir=back]\n"
        '\tb [label="b\n<some\n more content>" '
        'shape=note tooltip="<some\n more content>"]\n'
        '\ttag_1 [label="href:b"]\n'
        "\tb -> tag_1 [dir=back]\n"
        "\tb -> tag_0 [dir=forward]\n"
        '\ttag_2 [label="kw:bar"]\n'
        "\tb -> tag_2 [dir=both]\n"
        "}\n"
    )

0 commit comments

Comments
 (0)