diff --git a/api/core/workflow/nodes/http_request/entities.py b/api/core/workflow/nodes/http_request/entities.py index 36ded104c16a66..5e39ef79d139bf 100644 --- a/api/core/workflow/nodes/http_request/entities.py +++ b/api/core/workflow/nodes/http_request/entities.py @@ -1,4 +1,6 @@ +import mimetypes from collections.abc import Sequence +from email.message import Message from typing import Any, Literal, Optional import httpx @@ -7,14 +9,6 @@ from configs import dify_config from core.workflow.nodes.base import BaseNodeData -NON_FILE_CONTENT_TYPES = ( - "application/json", - "application/xml", - "text/html", - "text/plain", - "application/x-www-form-urlencoded", -) - class HttpRequestNodeAuthorizationConfig(BaseModel): type: Literal["basic", "bearer", "custom"] @@ -93,13 +87,53 @@ def __init__(self, response: httpx.Response): @property def is_file(self): - content_type = self.content_type + """ + Determine if the response contains a file by checking: + 1. Content-Disposition header (RFC 6266) + 2. Content characteristics + 3. MIME type analysis + """ + content_type = self.content_type.split(";")[0].strip().lower() content_disposition = self.response.headers.get("content-disposition", "") - return "attachment" in content_disposition or ( - not any(non_file in content_type for non_file in NON_FILE_CONTENT_TYPES) - and any(file_type in content_type for file_type in ("application/", "image/", "audio/", "video/")) - ) + # Check if it's explicitly marked as an attachment + if content_disposition: + msg = Message() + msg["content-disposition"] = content_disposition + disp_type = msg.get_content_disposition() # Returns 'attachment', 'inline', or None + filename = msg.get_filename() # Returns filename if present, None otherwise + if disp_type == "attachment" or filename is not None: + return True + + # For application types, try to detect if it's a text-based format + if content_type.startswith("application/"): + # Common text-based application types + if any( + text_type in content_type + for text_type in ("json", "xml", "javascript", "x-www-form-urlencoded", "yaml", "graphql") + ): + return False + + # Try to detect if content is text-based by sampling first few bytes + try: + # Sample first 1024 bytes for text detection + content_sample = self.response.content[:1024] + content_sample.decode("utf-8") + # If we can decode as UTF-8 and find common text patterns, likely not a file + text_markers = (b"{", b"[", b"<", b"function", b"var ", b"const ", b"let ") + if any(marker in content_sample for marker in text_markers): + return False + except UnicodeDecodeError: + # If we can't decode as UTF-8, likely a binary file + return True + + # For other types, use MIME type analysis + main_type, _ = mimetypes.guess_type("dummy" + (mimetypes.guess_extension(content_type) or "")) + if main_type: + return main_type.split("/")[0] in ("application", "image", "audio", "video") + + # For unknown types, check if it's a media type + return any(media_type in content_type for media_type in ("image/", "audio/", "video/")) @property def content_type(self) -> str: diff --git a/api/tests/unit_tests/core/workflow/nodes/http_request/test_entities.py b/api/tests/unit_tests/core/workflow/nodes/http_request/test_entities.py new file mode 100644 index 00000000000000..0f6b7e4ab6c1fd --- /dev/null +++ b/api/tests/unit_tests/core/workflow/nodes/http_request/test_entities.py @@ -0,0 +1,140 @@ +from unittest.mock import Mock, PropertyMock, patch + +import httpx +import pytest + +from core.workflow.nodes.http_request.entities import Response + + +@pytest.fixture +def mock_response(): + response = Mock(spec=httpx.Response) + response.headers = {} + return response + + +def test_is_file_with_attachment_disposition(mock_response): + """Test is_file when content-disposition header contains 'attachment'""" + mock_response.headers = {"content-disposition": "attachment; filename=test.pdf", "content-type": "application/pdf"} + response = Response(mock_response) + assert response.is_file + + +def test_is_file_with_filename_disposition(mock_response): + """Test is_file when content-disposition header contains filename parameter""" + mock_response.headers = {"content-disposition": "inline; filename=test.pdf", "content-type": "application/pdf"} + response = Response(mock_response) + assert response.is_file + + +@pytest.mark.parametrize("content_type", ["application/pdf", "image/jpeg", "audio/mp3", "video/mp4"]) +def test_is_file_with_file_content_types(mock_response, content_type): + """Test is_file with various file content types""" + mock_response.headers = {"content-type": content_type} + # Mock binary content + type(mock_response).content = PropertyMock(return_value=bytes([0x00, 0xFF] * 512)) + response = Response(mock_response) + assert response.is_file, f"Content type {content_type} should be identified as a file" + + +@pytest.mark.parametrize( + "content_type", + [ + "application/json", + "application/xml", + "application/javascript", + "application/x-www-form-urlencoded", + "application/yaml", + "application/graphql", + ], +) +def test_text_based_application_types(mock_response, content_type): + """Test common text-based application types are not identified as files""" + mock_response.headers = {"content-type": content_type} + response = Response(mock_response) + assert not response.is_file, f"Content type {content_type} should not be identified as a file" + + +@pytest.mark.parametrize( + ("content", "content_type"), + [ + (b'{"key": "value"}', "application/octet-stream"), + (b"[1, 2, 3]", "application/unknown"), + (b"function test() {}", "application/x-unknown"), + (b"test", "application/binary"), + (b"var x = 1;", "application/data"), + ], +) +def test_content_based_detection(mock_response, content, content_type): + """Test content-based detection for text-like content""" + mock_response.headers = {"content-type": content_type} + type(mock_response).content = PropertyMock(return_value=content) + response = Response(mock_response) + assert not response.is_file, f"Content {content} with type {content_type} should not be identified as a file" + + +@pytest.mark.parametrize( + ("content", "content_type"), + [ + (bytes([0x00, 0xFF] * 512), "application/octet-stream"), + (bytes([0x89, 0x50, 0x4E, 0x47]), "application/unknown"), # PNG magic numbers + (bytes([0xFF, 0xD8, 0xFF]), "application/binary"), # JPEG magic numbers + ], +) +def test_binary_content_detection(mock_response, content, content_type): + """Test content-based detection for binary content""" + mock_response.headers = {"content-type": content_type} + type(mock_response).content = PropertyMock(return_value=content) + response = Response(mock_response) + assert response.is_file, f"Binary content with type {content_type} should be identified as a file" + + +@pytest.mark.parametrize( + ("content_type", "expected_main_type"), + [ + ("x-world/x-vrml", "model"), # VRML 3D model + ("font/ttf", "application"), # TrueType font + ("text/csv", "text"), # CSV text file + ("unknown/xyz", None), # Unknown type + ], +) +def test_mimetype_based_detection(mock_response, content_type, expected_main_type): + """Test detection using mimetypes.guess_type for non-application content types""" + mock_response.headers = {"content-type": content_type} + type(mock_response).content = PropertyMock(return_value=bytes([0x00])) # Dummy content + + with patch("core.workflow.nodes.http_request.entities.mimetypes.guess_type") as mock_guess_type: + # Mock the return value based on expected_main_type + if expected_main_type: + mock_guess_type.return_value = (f"{expected_main_type}/subtype", None) + else: + mock_guess_type.return_value = (None, None) + + response = Response(mock_response) + + # Check if the result matches our expectation + if expected_main_type in ("application", "image", "audio", "video"): + assert response.is_file, f"Content type {content_type} should be identified as a file" + else: + assert not response.is_file, f"Content type {content_type} should not be identified as a file" + + # Verify that guess_type was called + mock_guess_type.assert_called_once() + + +def test_is_file_with_inline_disposition(mock_response): + """Test is_file when content-disposition is 'inline'""" + mock_response.headers = {"content-disposition": "inline", "content-type": "application/pdf"} + # Mock binary content + type(mock_response).content = PropertyMock(return_value=bytes([0x00, 0xFF] * 512)) + response = Response(mock_response) + assert response.is_file + + +def test_is_file_with_no_content_disposition(mock_response): + """Test is_file when no content-disposition header is present""" + mock_response.headers = {"content-type": "application/pdf"} + # Mock binary content + type(mock_response).content = PropertyMock(return_value=bytes([0x00, 0xFF] * 512)) + response = Response(mock_response) + assert response.is_file