Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix(http_request): allow content type application/x-javascript #10862

Merged
merged 3 commits into from
Nov 20, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
60 changes: 47 additions & 13 deletions api/core/workflow/nodes/http_request/entities.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
import mimetypes
from collections.abc import Sequence
from email.message import Message
from typing import Any, Literal, Optional

import httpx
Expand All @@ -7,14 +9,6 @@
from configs import dify_config
from core.workflow.nodes.base import BaseNodeData

# Content-type strings treated as non-file (textual / form-encoded) payloads.
NON_FILE_CONTENT_TYPES = (
    "application/json",
    "application/xml",
    "text/html",
    "text/plain",
    "application/x-www-form-urlencoded",
)


class HttpRequestNodeAuthorizationConfig(BaseModel):
type: Literal["basic", "bearer", "custom"]
Expand Down Expand Up @@ -93,13 +87,53 @@ def __init__(self, response: httpx.Response):

@property
def is_file(self):
    """
    Determine if the response contains a file.

    Checks are applied in this order:

    1. Content-Disposition header (RFC 6266): an ``attachment`` disposition
       or any ``filename`` parameter means the response is a file.
    2. ``application/*`` types: well-known text-based subtypes (JSON, XML,
       JavaScript, ...) are never files; otherwise the first 1024 bytes of
       the body are sampled. Bytes that are not valid UTF-8 mean a file;
       valid UTF-8 containing a common text marker means not a file.
    3. MIME type analysis through ``mimetypes``, falling back to a plain
       ``image/``, ``audio/``, ``video/`` check for unknown types.

    Returns:
        bool: True when the response should be handled as a file.
    """
    # Local import: only the content-sampling step below needs it.
    import codecs

    # Drop parameters such as "; charset=utf-8" and normalise the case.
    content_type = self.content_type.split(";")[0].strip().lower()
    content_disposition = self.response.headers.get("content-disposition", "")

    # Check if it's explicitly marked as an attachment
    if content_disposition:
        msg = Message()
        msg["content-disposition"] = content_disposition
        disp_type = msg.get_content_disposition()  # Returns 'attachment', 'inline', or None
        filename = msg.get_filename()  # Returns filename if present, None otherwise
        if disp_type == "attachment" or filename is not None:
            return True

    # For application types, try to detect if it's a text-based format
    if content_type.startswith("application/"):
        # Common text-based application types
        text_subtypes = ("json", "xml", "javascript", "x-www-form-urlencoded", "yaml", "graphql")
        if any(text_type in content_type for text_type in text_subtypes):
            return False

        # Sample the first 1024 bytes for text detection
        sample_size = 1024
        content = self.response.content
        content_sample = content[:sample_size]
        truncated = len(content) > sample_size
        try:
            # An incremental decoder with final=False tolerates a multi-byte
            # character cut in half by the sample boundary; a strict
            # bytes.decode() would misreport such text as binary.
            codecs.getincrementaldecoder("utf-8")().decode(content_sample, final=not truncated)
        except UnicodeDecodeError:
            # If we can't decode as UTF-8, likely a binary file
            return True
        else:
            # Valid UTF-8 with common text patterns: likely not a file
            text_markers = (b"{", b"[", b"<", b"function", b"var ", b"const ", b"let ")
            if any(marker in content_sample for marker in text_markers):
                return False

    # For other types, use MIME type analysis
    main_type, _ = mimetypes.guess_type("dummy" + (mimetypes.guess_extension(content_type) or ""))
    if main_type:
        return main_type.split("/")[0] in ("application", "image", "audio", "video")

    # For unknown types, check if it's a media type
    return any(media_type in content_type for media_type in ("image/", "audio/", "video/"))

@property
def content_type(self) -> str:
Expand Down
140 changes: 140 additions & 0 deletions api/tests/unit_tests/core/workflow/nodes/http_request/test_entities.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,140 @@
from unittest.mock import Mock, PropertyMock, patch

import httpx
import pytest

from core.workflow.nodes.http_request.entities import Response


@pytest.fixture
def mock_response():
    """Provide a mock constrained to the ``httpx.Response`` spec with empty headers."""
    mocked = Mock(spec=httpx.Response)
    mocked.headers = {}
    return mocked


def test_is_file_with_attachment_disposition(mock_response):
    """An ``attachment`` content-disposition header marks the response as a file."""
    headers = {"content-disposition": "attachment; filename=test.pdf", "content-type": "application/pdf"}
    mock_response.headers = headers
    wrapped = Response(mock_response)
    assert wrapped.is_file


def test_is_file_with_filename_disposition(mock_response):
    """A ``filename`` parameter in content-disposition marks the response as a file."""
    headers = {"content-disposition": "inline; filename=test.pdf", "content-type": "application/pdf"}
    mock_response.headers = headers
    wrapped = Response(mock_response)
    assert wrapped.is_file


@pytest.mark.parametrize("content_type", ["application/pdf", "image/jpeg", "audio/mp3", "video/mp4"])
def test_is_file_with_file_content_types(mock_response, content_type):
    """File-like content types carrying a binary body are reported as files."""
    binary_body = bytes([0x00, 0xFF] * 512)
    mock_response.headers = {"content-type": content_type}
    # The body is exposed through a property on the mock's own class.
    type(mock_response).content = PropertyMock(return_value=binary_body)
    wrapped = Response(mock_response)
    assert wrapped.is_file, f"Content type {content_type} should be identified as a file"


@pytest.mark.parametrize(
    "content_type",
    [
        "application/json",
        "application/xml",
        "application/javascript",
        # Regression case: the content type this change set out to allow.
        "application/x-javascript",
        "application/x-www-form-urlencoded",
        "application/yaml",
        "application/graphql",
    ],
)
def test_text_based_application_types(mock_response, content_type):
    """Test common text-based application types are not identified as files"""
    mock_response.headers = {"content-type": content_type}
    response = Response(mock_response)
    assert not response.is_file, f"Content type {content_type} should not be identified as a file"


@pytest.mark.parametrize(
    ("content", "content_type"),
    [
        (b'{"key": "value"}', "application/octet-stream"),
        (b"[1, 2, 3]", "application/unknown"),
        (b"function test() {}", "application/x-unknown"),
        (b"<root>test</root>", "application/binary"),
        (b"var x = 1;", "application/data"),
    ],
)
def test_content_based_detection(mock_response, content, content_type):
    """Text-looking bodies under opaque application types are not reported as files."""
    type(mock_response).content = PropertyMock(return_value=content)
    mock_response.headers = {"content-type": content_type}
    wrapped = Response(mock_response)
    assert not wrapped.is_file, f"Content {content} with type {content_type} should not be identified as a file"


@pytest.mark.parametrize(
    ("content", "content_type"),
    [
        (bytes([0x00, 0xFF] * 512), "application/octet-stream"),
        (bytes([0x89, 0x50, 0x4E, 0x47]), "application/unknown"),  # PNG magic numbers
        (bytes([0xFF, 0xD8, 0xFF]), "application/binary"),  # JPEG magic numbers
    ],
)
def test_binary_content_detection(mock_response, content, content_type):
    """Bodies that are not valid UTF-8 are reported as files."""
    type(mock_response).content = PropertyMock(return_value=content)
    mock_response.headers = {"content-type": content_type}
    wrapped = Response(mock_response)
    assert wrapped.is_file, f"Binary content with type {content_type} should be identified as a file"


@pytest.mark.parametrize(
    ("content_type", "expected_main_type"),
    [
        ("x-world/x-vrml", "model"),  # VRML 3D model
        ("font/ttf", "application"),  # TrueType font
        ("text/csv", "text"),  # CSV text file
        ("unknown/xyz", None),  # Unknown type
    ],
)
def test_mimetype_based_detection(mock_response, content_type, expected_main_type):
    """Non-application content types are classified from the patched ``mimetypes.guess_type`` result."""
    mock_response.headers = {"content-type": content_type}
    type(mock_response).content = PropertyMock(return_value=bytes([0x00]))  # Dummy content

    # What the patched guess_type reports for this case.
    guessed = (f"{expected_main_type}/subtype", None) if expected_main_type else (None, None)
    should_be_file = expected_main_type in ("application", "image", "audio", "video")

    with patch("core.workflow.nodes.http_request.entities.mimetypes.guess_type") as mock_guess_type:
        mock_guess_type.return_value = guessed

        wrapped = Response(mock_response)

        # Only the four file-like main types count as files.
        if should_be_file:
            assert wrapped.is_file, f"Content type {content_type} should be identified as a file"
        else:
            assert not wrapped.is_file, f"Content type {content_type} should not be identified as a file"

        # The MIME lookup must have run exactly once.
        mock_guess_type.assert_called_once()


def test_is_file_with_inline_disposition(mock_response):
    """A bare ``inline`` disposition with a binary PDF body is still reported as a file."""
    binary_body = bytes([0x00, 0xFF] * 512)
    mock_response.headers = {"content-disposition": "inline", "content-type": "application/pdf"}
    # The body is exposed through a property on the mock's own class.
    type(mock_response).content = PropertyMock(return_value=binary_body)
    wrapped = Response(mock_response)
    assert wrapped.is_file


def test_is_file_with_no_content_disposition(mock_response):
    """A binary PDF body is reported as a file when there is no content-disposition header."""
    binary_body = bytes([0x00, 0xFF] * 512)
    mock_response.headers = {"content-type": "application/pdf"}
    # The body is exposed through a property on the mock's own class.
    type(mock_response).content = PropertyMock(return_value=binary_body)
    wrapped = Response(mock_response)
    assert wrapped.is_file