def test_asset_range_request(client, tmpdir):
    """Access part of an asset using an HTTP Range header.

    Writes a small DataFrame, then requests byte ranges of its first asset
    through the raw asset endpoint. Successful requests must come back as
    ``206 Partial Content`` with a ``Content-Range`` header describing the
    bytes served; an out-of-range request must fail with 416 and a malformed
    ``Range`` header with 400.
    """
    # Imported locally so this test does not depend on the module-level
    # import list carrying the 206 constant.
    from starlette.status import HTTP_206_PARTIAL_CONTENT

    df = pandas.DataFrame({"A": [1, 2, 3], "B": [4.0, 5.0, 6.0]})
    client.write_dataframe(df, key="x")
    url = "/api/v1/asset/bytes/x?id=1"

    def get_with_range(range_header):
        "Issue a GET for the asset with the given raw Range header value."
        return client.context.http_client.get(
            url,
            headers={"Range": range_header},
        )

    # (first byte, last byte, expected payload). Both bounds are inclusive.
    # The expected payloads spell out b"PAR1", the Parquet magic number.
    satisfiable_cases = [
        (0, 0, b"P"),  # the first byte
        (0, 1, b"PA"),  # the first two bytes
        (2, 3, b"R1"),  # the second two bytes
    ]
    for first, last, expected in satisfiable_cases:
        response = get_with_range(f"bytes={first}-{last}")
        # A partial response must be flagged as such, not returned as a 200.
        assert response.status_code == HTTP_206_PARTIAL_CONTENT
        assert response.content == expected
        # The total size after the slash is not pinned down here; only the
        # served interval is checked.
        assert response.headers["content-range"].startswith(
            f"bytes {first}-{last}/"
        )

    # Request outside of range
    out_of_range_response = get_with_range("bytes=1000000-100000000")
    with fail_with_status_code(HTTP_416_REQUESTED_RANGE_NOT_SATISFIABLE):
        out_of_range_response.raise_for_status()

    # Request malformed range
    malformed_response = get_with_range("bytes=abc")
    with fail_with_status_code(HTTP_400_BAD_REQUEST):
        malformed_response.raise_for_status()
class FileResponseWithRange(FileResponse):
    """A ``FileResponse`` that can serve one contiguous byte range of a file.

    Parameters
    ----------
    path :
        Location of the file to serve.
    status_code :
        HTTP status of the response. Must be ``206`` whenever ``range`` is
        given.
    range :
        Optional ``(start, end)`` byte offsets, both inclusive, following the
        semantics of the HTTP ``Range`` header. ``None`` serves the whole
        file. (The name shadows the builtin, but it is part of the public
        keyword interface and is therefore kept.)
    *args, **kwargs :
        Forwarded unchanged to ``starlette.responses.FileResponse``.

    Raises
    ------
    RuntimeError
        If ``range`` is given with a status code other than 206, or if the
        range cannot be satisfied by the file (``start`` beyond the last
        byte, or ``start > end``).
    """

    def __init__(
        self,
        path: typing.Union[str, "os.PathLike[str]"],
        status_code: int = HTTP_200_OK,
        *args,
        range: typing.Optional[typing.Tuple[int, int]] = None,
        **kwargs,
    ):
        if (range is not None) and (status_code != HTTP_206_PARTIAL_CONTENT):
            raise RuntimeError(
                f"Range requests must have a {HTTP_206_PARTIAL_CONTENT} status code."
            )
        # Assign before delegating: the parent constructor may already call
        # set_stat_headers() (presumably whenever a stat_result is passed
        # in), and that method reads self.range.
        self.range = range
        super().__init__(path, status_code, *args, **kwargs)

    def set_stat_headers(self, stat_result: os.stat_result) -> None:
        """Populate length, range, and caching headers from ``stat_result``.

        When a range is set, its end is clamped to the last byte of the file
        and the clamped pair is stored back on ``self.range`` so that the
        advertised ``Content-Length`` / ``Content-Range`` and the bytes
        streamed by ``__call__`` always agree.

        Raises
        ------
        RuntimeError
            If the (clamped) range selects no bytes.
        """
        size = stat_result.st_size
        last_modified = formatdate(stat_result.st_mtime, usegmt=True)
        etag_base = str(stat_result.st_mtime) + "-" + str(size)
        if self.range is not None:
            start, end = self.range
            # A last-byte position at or past EOF means "through the end".
            end = min(end, size - 1)
            if start > end:
                # Covers start >= size as well as a reversed range. Without
                # this check the response would declare a zero or negative
                # Content-Length.
                raise RuntimeError(
                    f"Range {self.range[0]}-{self.range[1]} cannot be satisfied "
                    f"by a file of size {size}."
                )
            self.range = (start, end)
            # Distinct ranges of the same file get distinct ETags.
            etag_base += f"-{start}/{end}"
            content_length = str(end - start + 1)
            self.headers.setdefault("accept-ranges", "bytes")
            self.headers.setdefault("content-range", f"bytes {start}-{end}/{size}")
        else:
            content_length = str(size)
        etag = md5_hexdigest(etag_base.encode(), usedforsecurity=False)

        # setdefault: headers supplied explicitly by the caller win.
        self.headers.setdefault("content-length", content_length)
        self.headers.setdefault("last-modified", last_modified)
        self.headers.setdefault("etag", etag)

    async def __call__(self, scope: Scope, receive: Receive, send: Send) -> None:
        """ASGI entry point: send the headers, then the (partial) file body.

        Raises
        ------
        RuntimeError
            If no stat result was supplied at construction and the path does
            not exist or is not a regular file.
        """
        # Always bind a local stat result. It is needed below to compute the
        # length of a full-file response even when the caller supplied
        # stat_result up front (in which case no stat call is made here).
        stat_result = self.stat_result
        if stat_result is None:
            try:
                stat_result = await anyio.to_thread.run_sync(os.stat, self.path)
            except FileNotFoundError as err:
                raise RuntimeError(
                    f"File at path {self.path} does not exist."
                ) from err
            self.set_stat_headers(stat_result)
            if not stat.S_ISREG(stat_result.st_mode):
                raise RuntimeError(f"File at path {self.path} is not a file.")

        await send(
            {
                "type": "http.response.start",
                "status": self.status_code,
                "headers": self.raw_headers,
            }
        )
        if scope["method"].upper() == "HEAD":
            await send({"type": "http.response.body", "body": b"", "more_body": False})
        elif (
            # The pathsend message carries only a path, with no offsets, so
            # it can only be used to deliver a whole file.
            self.range is None
            and "extensions" in scope
            and "http.response.pathsend" in scope["extensions"]
        ):
            await send({"type": "http.response.pathsend", "path": str(self.path)})
        else:
            async with await anyio.open_file(self.path, mode="rb") as file:
                if self.range is not None:
                    start, end = self.range
                    await file.seek(start)
                else:
                    start, end = 0, stat_result.st_size - 1
                remaining_bytes = end - start + 1
                more_body = True
                while more_body:
                    chunk_size = min(remaining_bytes, self.chunk_size)
                    chunk = await file.read(chunk_size)
                    remaining_bytes -= len(chunk)
                    # A short read means EOF arrived early (e.g. the file
                    # shrank); stop instead of looping on empty reads.
                    more_body = remaining_bytes > 0 and len(chunk) == chunk_size
                    await send(
                        {
                            "type": "http.response.body",
                            "body": chunk,
                            "more_body": more_body,
                        }
                    )
        if self.background is not None:
            await self.background()
.settings import get_settings from .utils import filter_for_access, get_base_url, record_timing @@ -1519,6 +1522,12 @@ async def delete_revision( return json_or_msgpack(request, None) +# For simplicity of implementation, we support a restricted subset of the full +# Range spec. This could be extended if the need arises. +# https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Range +RANGE_HEADER_PATTERN = re.compile(r"^bytes=(\d+)-(\d+)$") + + @router.get("/asset/bytes/{path:path}") async def get_asset( request: Request, @@ -1587,12 +1596,34 @@ async def get_asset( full_path = path stat_result = await anyio.to_thread.run_sync(os.stat, full_path) filename = full_path.name - return FileResponse( + if "range" in request.headers: + range_header = request.headers["range"] + match = RANGE_HEADER_PATTERN.match(range_header) + if match is None: + raise HTTPException( + status_code=HTTP_400_BAD_REQUEST, + detail=( + "Only a Range headers of the form 'bytes=start-end' are supported. " + f"Could not parse Range header: {range_header}", + ), + ) + range = start, _ = (int(match.group(1)), int(match.group(2))) + if start > stat_result.st_size: + raise HTTPException( + status_code=HTTP_416_REQUESTED_RANGE_NOT_SATISFIABLE, + headers={"content-range": f"bytes */{stat_result.st_size}"}, + ) + status_code = HTTP_206_PARTIAL_CONTENT + else: + range = None + status_code = HTTP_200_OK + return FileResponseWithRange( full_path, stat_result=stat_result, method="GET", - status_code=HTTP_200_OK, + status_code=status_code, headers={"Content-Disposition": f'attachment; filename="{filename}"'}, + range=range, )