Skip to content

Commit e75b891

Browse files
mayurinehate and Mayuri N authored
fix: truncate long descriptions, sanitize content (#46)
* fix: truncate long descriptions, sanitize content * add logging --------- Co-authored-by: Mayuri N <mayuri.nehate@datahub.com>
1 parent aa94432 commit e75b891

File tree

5 files changed

+169
-10
lines changed

5 files changed

+169
-10
lines changed

pyproject.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@ dependencies = [
99
"asyncer>=0.0.8",
1010
"fastmcp==2.10.5",
1111
"jmespath~=1.0.1",
12+
"loguru",
1213
]
1314
license = "Apache-2.0"
1415

src/mcp_server_datahub/mcp_server.py

Lines changed: 79 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,10 @@
11
import contextlib
22
import contextvars
33
import functools
4+
import html
45
import inspect
56
import pathlib
7+
import re
68
from typing import (
79
Any,
810
Awaitable,
@@ -25,10 +27,85 @@
2527
from datahub.sdk.search_filters import Filter, FilterDsl, load_filters
2628
from datahub.utilities.ordered_set import OrderedSet
2729
from fastmcp import FastMCP
30+
from loguru import logger
2831
from pydantic import BaseModel
2932

3033
_P = ParamSpec("_P")
3134
_R = TypeVar("_R")
35+
DESCRIPTION_LENGTH_HARD_LIMIT = 1000
36+
37+
38+
def sanitize_html_content(text: str) -> str:
    """Strip HTML tags from *text*, decode HTML entities, and trim whitespace.

    Falsy input (empty string, ``None``) is returned unchanged.
    """
    if text:
        # Drop anything that looks like a tag, e.g. <p>, </div>, <img ... />.
        without_tags = re.sub(r"<[^>]+>", "", text)
        # Entities are decoded after tag removal, so an escaped tag such as
        # "&lt;b&gt;" ends up as literal "<b>" text in the result.
        return html.unescape(without_tags).strip()
    return text
50+
51+
52+
def truncate_with_ellipsis(text: str, max_length: int, suffix: str = "...") -> str:
    """Shorten *text* to at most *max_length* characters, marking the cut with *suffix*.

    Args:
        text: The string to shorten. Falsy values are returned unchanged.
        max_length: Maximum length of the returned string, suffix included.
        suffix: Marker appended when the text is cut.

    Returns:
        ``text`` itself when it already fits; otherwise a string no longer
        than ``max_length``. When ``max_length`` is too small to hold the
        suffix, the text is hard-cut to ``max_length`` without a suffix, and
        a non-positive ``max_length`` yields an empty string.
    """
    if not text or len(text) <= max_length:
        return text

    if max_length <= 0:
        return ""

    # Room left for real text once the suffix is accounted for.
    keep = max_length - len(suffix)
    if keep < 0:
        # A negative slice bound would count from the end of the string and
        # produce a result longer than max_length; hard-cut instead.
        return text[:max_length]

    return text[:keep] + suffix
60+
61+
62+
def sanitize_markdown_content(text: str) -> str:
    """Replace markdown image embeds that carry a ``data:`` URL with their alt text.

    Embeds pointing at ordinary URLs are left alone. The result is stripped
    of leading and trailing whitespace. Falsy input is returned unchanged.
    """
    if text:
        # ![alt](data:...) -> alt   (group 1 captures the alt text)
        without_embeds = re.sub(r"!\[([^\]]*)\]\(data:[^)]+\)", r"\1", text)
        return without_embeds.strip()
    return text
72+
73+
74+
def sanitize_and_truncate_description(text: str, max_length: int) -> str:
    """Clean a description of HTML and data-URL markdown embeds, then cap its length.

    The pipeline is: HTML sanitization, markdown sanitization, truncation
    with an ellipsis. This is best-effort: if any step raises, a warning is
    logged and the raw text is returned, hard-cut to ``max_length``.
    Falsy input is returned unchanged.
    """
    if text:
        try:
            cleaned = sanitize_markdown_content(sanitize_html_content(text))
            return truncate_with_ellipsis(cleaned, max_length)
        except Exception as e:
            logger.warning(f"Error sanitizing and truncating description: {e}")
            # Slicing is a no-op when the text already fits.
            return text[:max_length]
    return text
91+
92+
93+
def truncate_descriptions(
    data: dict | list, max_length: int = DESCRIPTION_LENGTH_HARD_LIMIT
) -> None:
    """
    Recursively sanitize and truncate, in place, every string value stored
    under a key named 'description'.

    Args:
        data: A nested structure of dicts and lists. Other values are ignored.
        max_length: Maximum length for each description. The same limit is
            applied at every nesting level.

    Returns:
        None. ``data`` is modified in place.
    """
    # TODO: path-aware truncate, for different length limits per entity type
    if isinstance(data, dict):
        for key, value in data.items():
            if key == "description" and isinstance(value, str):
                # Reassigning an existing key does not resize the dict, so
                # this is safe while iterating over it.
                data[key] = sanitize_and_truncate_description(value, max_length)
            elif isinstance(value, (dict, list)):
                # Forward the caller's limit; otherwise nested descriptions
                # would silently fall back to the default hard limit.
                truncate_descriptions(value, max_length)
    elif isinstance(data, list):
        for item in data:
            truncate_descriptions(item, max_length)
32109

33110

34111
# See https://github.com/jlowin/fastmcp/issues/864#issuecomment-3103678258
@@ -192,6 +269,7 @@ def get_entity(urn: str) -> dict:
192269
)["entity"]
193270

194271
inject_urls_for_urns(client._graph, result, [""])
272+
truncate_descriptions(result)
195273

196274
return clean_get_entity_response(result)
197275

@@ -440,4 +518,5 @@ def get_lineage(
440518
)
441519
lineage = lineage_api.get_lineage(asset_lineage_directive)
442520
inject_urls_for_urns(client._graph, lineage, ["*.searchResults[].entity"])
521+
truncate_descriptions(lineage)
443522
return lineage

tests/conftest.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,6 @@
55
os.environ["DATAHUB_TELEMETRY_ENABLED"] = "false"
66

77

8-
@pytest.fixture
8+
@pytest.fixture(scope="module")
99
def anyio_backend() -> str:
1010
return "asyncio"

tests/test_mcp_server_helpers.py

Lines changed: 63 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -5,11 +5,12 @@
55
maybe_convert_to_schema_field_urn,
66
clean_gql_response,
77
clean_get_entity_response,
8+
truncate_descriptions,
89
)
910
from datahub.ingestion.graph.links import make_url_for_urn
1011

1112

12-
def test_inject_urls_for_urns():
13+
def test_inject_urls_for_urns() -> None:
1314
mock_graph = Mock()
1415
mock_graph.url_for.side_effect = lambda urn: make_url_for_urn(
1516
"https://xyz.com", urn
@@ -58,7 +59,7 @@ def test_inject_urls_for_urns():
5859
assert mock_graph.url_for.call_count == 2
5960

6061

61-
def test_maybe_convert_to_schema_field_urn_with_column():
62+
def test_maybe_convert_to_schema_field_urn_with_column() -> None:
6263
dataset_urn = "urn:li:dataset:(urn:li:dataPlatform:snowflake,analytics_db.raw_schema.users,PROD)"
6364
column = "user_id"
6465

@@ -70,15 +71,15 @@ def test_maybe_convert_to_schema_field_urn_with_column():
7071
)
7172

7273

73-
def test_maybe_convert_to_schema_field_urn_without_column():
74+
def test_maybe_convert_to_schema_field_urn_without_column() -> None:
7475
original_urn = "urn:li:dataset:(urn:li:dataPlatform:snowflake,analytics_db.raw_schema.users,PROD)"
7576

7677
result = maybe_convert_to_schema_field_urn(original_urn, None)
7778

7879
assert result == original_urn
7980

8081

81-
def test_maybe_convert_to_schema_field_urn_with_incorrect_entity():
82+
def test_maybe_convert_to_schema_field_urn_with_incorrect_entity() -> None:
8283
chart_urn = "urn:li:chart:(looker,baz)"
8384

8485
# Ok if no column is provided
@@ -91,8 +92,8 @@ def test_maybe_convert_to_schema_field_urn_with_incorrect_entity():
9192
maybe_convert_to_schema_field_urn(chart_urn, column)
9293

9394

94-
def test_clean_gql_response_with_dict():
95-
response = {
95+
def test_clean_gql_response_with_dict() -> None:
96+
response: dict = {
9697
"__typename": "Dataset",
9798
"urn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,analytics_db.raw_schema.users,PROD)",
9899
"name": "users",
@@ -112,7 +113,7 @@ def test_clean_gql_response_with_dict():
112113
assert result == expected_result
113114

114115

115-
def test_clean_gql_response_with_nested_empty_objects():
116+
def test_clean_gql_response_with_nested_empty_objects() -> None:
116117
response = {
117118
"urn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,analytics_db.raw_schema.users,PROD)",
118119
"name": "users",
@@ -141,7 +142,7 @@ def test_clean_gql_response_with_nested_empty_objects():
141142
assert result == expected_result
142143

143144

144-
def test_clean_get_entity_response_with_schema_metadata():
145+
def test_clean_get_entity_response_with_schema_metadata() -> None:
145146
raw_response = {
146147
"urn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,analytics_db.raw_schema.users,PROD)",
147148
"name": "users",
@@ -195,3 +196,57 @@ def test_clean_get_entity_response_with_schema_metadata():
195196
}
196197

197198
assert result == expected_result
199+
200+
201+
def test_truncate_descriptions() -> None:
    """Check that truncate_descriptions rewrites nested 'description' values in place.

    The fixture nests descriptions under a top-level entity, under
    ``properties``, and inside a ``fields`` list. The ``fieldPath`` values
    equal to "description" are expected to stay untouched, since only the
    dictionary key is matched.
    """
    # NOTE(review): the `_data` placeholders look like elided data URLs —
    # confirm against the real fixture before relying on these literals.
    result = {
        "downstreams": {
            "searchResults": [
                {
                    "entity": {
                        "description": "Description with ![image](_data) and more content that exceeds the limit",
                        "properties": {
                            "description": "Description with image <img src='_data' /> and more content that exceeds the limit"
                        },
                        "fields": [
                            {
                                "fieldPath": "description",
                                "description": "Description with image <img src='_data' /> and more content that exceeds the limit",
                            },
                            {
                                "fieldPath": "description",
                                "description": "Simple description",
                            },
                        ],
                    }
                }
            ]
        }
    }

    truncate_descriptions(result, 50)

    # NOTE(review): the expected sanitized strings below are 62 characters
    # long, which is above the limit of 50 passed above — confirm whether the
    # limit is meant to apply to nested descriptions.
    assert result == {
        "downstreams": {
            "searchResults": [
                {
                    "entity": {
                        "description": "Description with image and more content that exceeds the limit",
                        "properties": {
                            "description": "Description with image and more content that exceeds the limit"
                        },
                        "fields": [
                            {
                                "fieldPath": "description",
                                "description": "Description with image and more content that exceeds the limit",
                            },
                            {
                                "fieldPath": "description",
                                "description": "Simple description",
                            },
                        ],
                    }
                }
            ]
        }
    }

uv.lock

Lines changed: 25 additions & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)