fix: truncate long descriptions, sanitize content (#46)

mayurinehate · Mayuri N · web-flow · commit e75b891f6257 · 2025-09-04T13:48:20.000+05:30
* fix: truncate long descriptions, sanitize content

* add logging

---------

Co-authored-by: Mayuri N <mayuri.nehate@datahub.com>
diff --git a/pyproject.toml b/pyproject.toml
@@ -9,6 +9,7 @@ dependencies = [
     "asyncer>=0.0.8",
     "fastmcp==2.10.5",
     "jmespath~=1.0.1",
+    "loguru",
 ]
 license = "Apache-2.0"
 
diff --git a/src/mcp_server_datahub/mcp_server.py b/src/mcp_server_datahub/mcp_server.py
@@ -1,8 +1,10 @@
 import contextlib
 import contextvars
 import functools
+import html
 import inspect
 import pathlib
+import re
 from typing import (
     Any,
     Awaitable,
@@ -25,10 +27,85 @@
 from datahub.sdk.search_filters import Filter, FilterDsl, load_filters
 from datahub.utilities.ordered_set import OrderedSet
 from fastmcp import FastMCP
+from loguru import logger
 from pydantic import BaseModel
 
 _P = ParamSpec("_P")
 _R = TypeVar("_R")
+# Default maximum length, in characters, for 'description' values; it is the
+# default value of truncate_descriptions' max_length parameter.
+DESCRIPTION_LENGTH_HARD_LIMIT = 1000
+
+
+def sanitize_html_content(text: str) -> str:
+    """Strip HTML markup from text and turn entities back into plain characters.
+
+    Every ``<...>`` span is deleted (so ``<img>`` embeds vanish), HTML entities
+    such as ``&amp;`` are then decoded, and surrounding whitespace is trimmed.
+    Falsy input is returned unchanged.
+    """
+    if text:
+        without_tags = re.sub(r"<[^>]+>", "", text)
+        # Entities are decoded only after tag removal, so an escaped
+        # "&lt;b&gt;" comes out as the literal text "<b>" instead of being
+        # stripped.
+        return html.unescape(without_tags).strip()
+    return text
+
+
+def truncate_with_ellipsis(text: str, max_length: int, suffix: str = "...") -> str:
+    """Shorten text to at most max_length characters, marking the cut with suffix.
+
+    Text that is empty or already fits is returned unchanged. Otherwise the
+    result is a prefix of text followed by suffix, max_length characters long
+    in total. When max_length is too small to hold the whole suffix, the
+    suffix itself is cut down so the result still fits; a negative max_length
+    is treated as 0.
+    """
+    if not text or len(text) <= max_length:
+        return text
+
+    limit = max(max_length, 0)
+    # Clamp at zero: a negative slice bound counts from the end of the string
+    # (text[:-1]) and would keep nearly all of the text instead of none of it.
+    keep = max(limit - len(suffix), 0)
+    return (text[:keep] + suffix)[:limit]
+
+
+def sanitize_markdown_content(text: str) -> str:
+    """Replace markdown image embeds that carry inline data with their alt text.
+
+    An embed written as ``![alt](data:...)`` (for example a base64-encoded
+    image) is reduced to just ``alt``. The result is stripped of surrounding
+    whitespace. Falsy input is returned unchanged.
+    """
+    if text:
+        # Group 1 captures the alt text, the only part of the embed that is kept.
+        alt_only = re.sub(r"!\[([^\]]*)\]\(data:[^)]+\)", r"\1", text)
+        return alt_only.strip()
+    return text
+
+
+def sanitize_and_truncate_description(text: str, max_length: int) -> str:
+    """Clean a description and cap its length.
+
+    The text goes through sanitize_html_content, then
+    sanitize_markdown_content, and the result is shortened with
+    truncate_with_ellipsis. If any of those steps raises, a warning is logged
+    and the raw text, cut to max_length without a suffix, is returned instead.
+    Falsy input is returned unchanged.
+    """
+    if text:
+        try:
+            # HTML first, then markdown embeds (alt text kept), then the length cap.
+            cleaned = sanitize_markdown_content(sanitize_html_content(text))
+            return truncate_with_ellipsis(cleaned, max_length)
+        except Exception as e:
+            logger.warning(f"Error sanitizing and truncating description: {e}")
+            # Best effort: fall back to the unsanitized text, hard-cut if too long.
+            return text if len(text) <= max_length else text[:max_length]
+    return text
+
+
+def truncate_descriptions(
+    data: dict | list, max_length: int = DESCRIPTION_LENGTH_HARD_LIMIT
+) -> None:
+    """
+    Recursively sanitizes and truncates values of keys named 'description' in place.
+
+    Walks nested dicts and lists. Every string stored under a 'description' key
+    is replaced with sanitize_and_truncate_description(value, max_length); any
+    other dict or list value is descended into. The same max_length applies at
+    every nesting level. Nothing is returned; the input is mutated.
+    """
+    # TODO: path-aware truncate, for different length limits per entity type
+    if isinstance(data, dict):
+        for key, value in data.items():
+            if key == "description" and isinstance(value, str):
+                # Rebinding an existing key does not resize the dict, so this
+                # is safe while iterating over items().
+                data[key] = sanitize_and_truncate_description(value, max_length)
+            elif isinstance(value, (dict, list)):
+                # Forward the caller's limit; without it nested levels would
+                # silently fall back to the default.
+                truncate_descriptions(value, max_length)
+    elif isinstance(data, list):
+        for item in data:
+            truncate_descriptions(item, max_length)
 
 
 # See https://github.com/jlowin/fastmcp/issues/864#issuecomment-3103678258
@@ -192,6 +269,7 @@ def get_entity(urn: str) -> dict:
     )["entity"]
 
     inject_urls_for_urns(client._graph, result, [""])
+    truncate_descriptions(result)
 
     return clean_get_entity_response(result)
 
@@ -440,4 +518,5 @@ def get_lineage(
     )
     lineage = lineage_api.get_lineage(asset_lineage_directive)
     inject_urls_for_urns(client._graph, lineage, ["*.searchResults[].entity"])
+    truncate_descriptions(lineage)
     return lineage
diff --git a/tests/conftest.py b/tests/conftest.py
@@ -5,6 +5,6 @@
 os.environ["DATAHUB_TELEMETRY_ENABLED"] = "false"
 
 
-@pytest.fixture
+@pytest.fixture(scope="module")
 def anyio_backend() -> str:
     return "asyncio"
diff --git a/tests/test_mcp_server_helpers.py b/tests/test_mcp_server_helpers.py
@@ -5,11 +5,12 @@
     maybe_convert_to_schema_field_urn,
     clean_gql_response,
     clean_get_entity_response,
+    truncate_descriptions,
 )
 from datahub.ingestion.graph.links import make_url_for_urn
 
 
-def test_inject_urls_for_urns():
+def test_inject_urls_for_urns() -> None:
     mock_graph = Mock()
     mock_graph.url_for.side_effect = lambda urn: make_url_for_urn(
         "https://xyz.com", urn
@@ -58,7 +59,7 @@ def test_inject_urls_for_urns():
         assert mock_graph.url_for.call_count == 2
 
 
-def test_maybe_convert_to_schema_field_urn_with_column():
+def test_maybe_convert_to_schema_field_urn_with_column() -> None:
     dataset_urn = "urn:li:dataset:(urn:li:dataPlatform:snowflake,analytics_db.raw_schema.users,PROD)"
     column = "user_id"
 
@@ -70,15 +71,15 @@ def test_maybe_convert_to_schema_field_urn_with_column():
     )
 
 
-def test_maybe_convert_to_schema_field_urn_without_column():
+def test_maybe_convert_to_schema_field_urn_without_column() -> None:
     original_urn = "urn:li:dataset:(urn:li:dataPlatform:snowflake,analytics_db.raw_schema.users,PROD)"
 
     result = maybe_convert_to_schema_field_urn(original_urn, None)
 
     assert result == original_urn
 
 
-def test_maybe_convert_to_schema_field_urn_with_incorrect_entity():
+def test_maybe_convert_to_schema_field_urn_with_incorrect_entity() -> None:
     chart_urn = "urn:li:chart:(looker,baz)"
 
     # Ok if no column is provided
@@ -91,8 +92,8 @@ def test_maybe_convert_to_schema_field_urn_with_incorrect_entity():
         maybe_convert_to_schema_field_urn(chart_urn, column)
 
 
-def test_clean_gql_response_with_dict():
-    response = {
+def test_clean_gql_response_with_dict() -> None:
+    response: dict = {
         "__typename": "Dataset",
         "urn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,analytics_db.raw_schema.users,PROD)",
         "name": "users",
@@ -112,7 +113,7 @@ def test_clean_gql_response_with_dict():
     assert result == expected_result
 
 
-def test_clean_gql_response_with_nested_empty_objects():
+def test_clean_gql_response_with_nested_empty_objects() -> None:
     response = {
         "urn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,analytics_db.raw_schema.users,PROD)",
         "name": "users",
@@ -141,7 +142,7 @@ def test_clean_gql_response_with_nested_empty_objects():
     assert result == expected_result
 
 
-def test_clean_get_entity_response_with_schema_metadata():
+def test_clean_get_entity_response_with_schema_metadata() -> None:
     raw_response = {
         "urn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,analytics_db.raw_schema.users,PROD)",
         "name": "users",
@@ -195,3 +196,57 @@ def test_clean_get_entity_response_with_schema_metadata():
     }
 
     assert result == expected_result
+
+
+def test_truncate_descriptions() -> None:
+    """Descriptions are sanitized at every depth and over-long text is cut to the limit."""
+    result: dict = {
+        # Longer than the limit passed below, so it must come back truncated.
+        "description": "x" * 60,
+        "downstreams": {
+            "searchResults": [
+                {
+                    "entity": {
+                        "description": "Description with ![image](data:image/png;base64,encoded_data) and more content",
+                        "properties": {
+                            "description": "Description with image <img src='data:image/png;base64,encoded_data' /> and more content"
+                        },
+                        "fields": [
+                            {
+                                "fieldPath": "description",
+                                "description": "Description with image <img src='data:image/png;base64,encoded_data' /> and more content",
+                            },
+                            {
+                                "fieldPath": "description",
+                                "description": "Simple description",
+                            },
+                        ],
+                    }
+                }
+            ]
+        },
+    }
+
+    truncate_descriptions(result, 50)
+
+    assert result == {
+        # 47 kept characters plus the 3-character "..." suffix add up to 50.
+        "description": "x" * 47 + "...",
+        "downstreams": {
+            "searchResults": [
+                {
+                    "entity": {
+                        # Once sanitized, the nested descriptions are shorter than
+                        # 50 characters, so they are cleaned but never cut.
+                        "description": "Description with image and more content",
+                        "properties": {
+                            "description": "Description with image  and more content"
+                        },
+                        "fields": [
+                            {
+                                "fieldPath": "description",
+                                "description": "Description with image  and more content",
+                            },
+                            {
+                                "fieldPath": "description",
+                                "description": "Simple description",
+                            },
+                        ],
+                    }
+                }
+            ]
+        },
+    }
diff --git a/uv.lock b/uv.lock

Original file line number	Diff line number	Diff line change
`@@ -9,6 +9,7 @@ dependencies = [`
`9`	`9`	`"asyncer>=0.0.8",`
`10`	`10`	`"fastmcp==2.10.5",`
`11`	`11`	`"jmespath~=1.0.1",`
	`12`	`+ "loguru",`
`12`	`13`	`]`
`13`	`14`	`license = "Apache-2.0"`
`14`	`15`