Skip to content

Commit

Permalink
Range requests (bluesky#762)
Browse files Browse the repository at this point in the history
* Support Range header on GET /asset/bytes

* Add to CHANGELOG

* Remove stray trailing comma

* update literal http status codes with starlette.status status codes

* add CI suggestion

* Clarify comment

---------

Co-authored-by: Hiran Wijesinghe <wwijesing1@bnl.gov>
  • Loading branch information
danielballan and hyperrealist committed Jun 27, 2024
1 parent 99d5682 commit 1e94f4d
Show file tree
Hide file tree
Showing 4 changed files with 187 additions and 4 deletions.
4 changes: 4 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,10 @@ Write the date in place of the "Unreleased" in the case a new version is release

## Unreleased

### Added
- Support partial download of an asset using the
[HTTP `Range` Header](https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Range).

### Fixed
- When authenticated as a Service Principal, display the SP's uuid in
the client Context repr.
Expand Down
45 changes: 44 additions & 1 deletion tiled/_tests/test_asset_access.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,13 @@
import hashlib
from pathlib import Path

import pandas
import pytest
from starlette.status import HTTP_403_FORBIDDEN
from starlette.status import (
HTTP_400_BAD_REQUEST,
HTTP_403_FORBIDDEN,
HTTP_416_REQUESTED_RANGE_NOT_SATISFIABLE,
)

from ..catalog import in_memory
from ..client import Context, from_context
Expand Down Expand Up @@ -66,6 +71,44 @@ def test_raw_export(client, tmpdir):
assert orig_hashes == exported_hashes


def test_asset_range_request(client, tmpdir):
    "Fetch slices of an asset via the HTTP Range header, including bad ranges."
    client.write_dataframe(
        pandas.DataFrame({"A": [1, 2, 3], "B": [4.0, 5.0, 6.0]}),
        key="x",
    )
    asset_url = "/api/v1/asset/bytes/x?id=1"

    def fetch(range_value):
        # Issue a GET for the asset with the given raw Range header value.
        return client.context.http_client.get(
            asset_url,
            headers={"Range": range_value},
        )

    # Satisfiable ranges: first byte, first two bytes, second two bytes.
    satisfiable_cases = (
        ("bytes=0-0", b"P"),
        ("bytes=0-1", b"PA"),
        ("bytes=2-3", b"R1"),
    )
    for range_value, expected_content in satisfiable_cases:
        assert fetch(range_value).content == expected_content

    # A range that starts beyond the end of the asset is rejected.
    beyond_end = fetch("bytes=1000000-100000000")
    with fail_with_status_code(HTTP_416_REQUESTED_RANGE_NOT_SATISFIABLE):
        beyond_end.raise_for_status()

    # A Range header that cannot be parsed is rejected.
    unparseable = fetch("bytes=abc")
    with fail_with_status_code(HTTP_400_BAD_REQUEST):
        unparseable.raise_for_status()


def test_get_asset_filepaths(client):
"Smoke test get_asset_filepaths."
client.write_array([1, 2, 3], key="x")
Expand Down
105 changes: 105 additions & 0 deletions tiled/server/file_response_with_range.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,105 @@
# This is a variation on starlette's FileResponse that adds support for the
# 'Range' HTTP header.

# It is adapted from a closed PR in starlette which was reviewed by a core
# starlette maintainer but put aside for now in favor of other priorities in
# starlette development. Thus, we implement it here in tiled. If in the future
# starlette adds support upstream, we should consider refactoring to use that.

# Ref: https://github.com/encode/starlette/pull/1999
import os
import stat
import typing

import anyio
from starlette.responses import (
FileResponse,
Receive,
Scope,
Send,
formatdate,
md5_hexdigest,
)
from starlette.status import HTTP_200_OK, HTTP_206_PARTIAL_CONTENT


class FileResponseWithRange(FileResponse):
    """
    A FileResponse that can send a single contiguous byte range of a file.

    Parameters
    ----------
    path : str or os.PathLike
        Location of the file to send.
    status_code : int
        HTTP status code. Must be 206 (Partial Content) when ``range`` is given.
    *args, **kwargs
        Passed through unchanged to the parent ``FileResponse``.
    range : tuple of (int, int), optional
        Inclusive ``(start, end)`` byte offsets to send. When None (default),
        the whole file is sent.

    Raises
    ------
    RuntimeError
        If ``range`` is given with a status code other than 206.
    """

    def __init__(
        self,
        path: typing.Union[str, "os.PathLike[str]"],
        status_code: int = HTTP_200_OK,
        *args,
        range: typing.Optional[typing.Tuple[int, int]] = None,
        **kwargs,
    ):
        if (range is not None) and (status_code != HTTP_206_PARTIAL_CONTENT):
            raise RuntimeError(
                f"Range requests must have a {HTTP_206_PARTIAL_CONTENT} status code."
            )
        # Assign before delegating to the parent initializer: set_stat_headers
        # reads self.range, and the parent may invoke it during initialization
        # when a stat_result is supplied.
        self.range = range
        super().__init__(path, status_code, *args, **kwargs)

    def _clamped_range(self, size: int) -> typing.Tuple[int, int]:
        """
        Return the requested (start, end), with end limited to the last byte.

        A range whose end lies past the end of the file is treated as
        "through the end of the file", so that the advertised Content-Length
        and Content-Range agree with the bytes actually sent. A range whose
        start lies at or beyond the end of the file is returned unchanged;
        rejecting it is left to the caller.
        """
        start, end = self.range
        if start < size:
            end = min(end, size - 1)
        return start, end

    def set_stat_headers(self, stat_result: os.stat_result) -> None:
        """
        Set content-length, last-modified, and etag from the file's stat result.

        When a range was requested, also set accept-ranges and content-range,
        and report the length of the range rather than of the whole file.
        Headers that are already present are not overwritten.
        """
        size = str(stat_result.st_size)
        last_modified = formatdate(stat_result.st_mtime, usegmt=True)
        etag_base = str(stat_result.st_mtime) + "-" + str(stat_result.st_size)
        if self.range is not None:
            start, end = self._clamped_range(stat_result.st_size)
            # Fold the range into the ETag so distinct slices get distinct tags.
            etag_base += f"-{start}/{end}"
            # Both offsets are inclusive, hence the +1.
            content_length = str(end - start + 1)
            self.headers.setdefault("accept-ranges", "bytes")
            self.headers.setdefault("content-range", f"bytes {start}-{end}/{size}")
        else:
            content_length = size
        etag = md5_hexdigest(etag_base.encode(), usedforsecurity=False)

        self.headers.setdefault("content-length", content_length)
        self.headers.setdefault("last-modified", last_modified)
        self.headers.setdefault("etag", etag)

    async def __call__(self, scope: Scope, receive: Receive, send: Send) -> None:
        """
        Send the response start message followed by the file (or range) body.

        Raises
        ------
        RuntimeError
            If no stat_result was supplied and the path does not exist or is
            not a regular file.
        """
        if self.stat_result is None:
            try:
                stat_result = await anyio.to_thread.run_sync(os.stat, self.path)
                self.set_stat_headers(stat_result)
            except FileNotFoundError:
                raise RuntimeError(f"File at path {self.path} does not exist.")
            else:
                mode = stat_result.st_mode
                if not stat.S_ISREG(mode):
                    raise RuntimeError(f"File at path {self.path} is not a file.")
        else:
            # The caller supplied a stat result up front; reuse it so the size
            # is available below when sending the whole file.
            stat_result = self.stat_result
        await send(
            {
                "type": "http.response.start",
                "status": self.status_code,
                "headers": self.raw_headers,
            }
        )
        if scope["method"].upper() == "HEAD":
            await send({"type": "http.response.body", "body": b"", "more_body": False})
        elif (
            self.range is None
            and "extensions" in scope
            and "http.response.pathsend" in scope["extensions"]
        ):
            # pathsend hands the entire file to the server, so it is only
            # usable when no range was requested.
            await send({"type": "http.response.pathsend", "path": str(self.path)})
        else:
            async with await anyio.open_file(self.path, mode="rb") as file:
                if self.range is not None:
                    start, end = self._clamped_range(stat_result.st_size)
                    await file.seek(start)
                else:
                    start, end = 0, stat_result.st_size - 1
                remaining_bytes = end - start + 1
                more_body = True
                while more_body:
                    chunk_size = min(remaining_bytes, self.chunk_size)
                    chunk = await file.read(chunk_size)
                    remaining_bytes -= len(chunk)
                    # Stop once the range is exhausted or on a short read.
                    more_body = remaining_bytes > 0 and len(chunk) == chunk_size
                    await send(
                        {
                            "type": "http.response.body",
                            "body": chunk,
                            "more_body": more_body,
                        }
                    )
        if self.background is not None:
            await self.background()
37 changes: 34 additions & 3 deletions tiled/server/router.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import dataclasses
import inspect
import os
import re
import warnings
from datetime import datetime, timedelta
from functools import partial
Expand All @@ -13,14 +14,15 @@
from json_merge_patch import merge as apply_merge_patch
from jsonpatch import apply_patch as apply_json_patch
from pydantic_settings import BaseSettings
from starlette.responses import FileResponse
from starlette.status import (
HTTP_200_OK,
HTTP_206_PARTIAL_CONTENT,
HTTP_400_BAD_REQUEST,
HTTP_403_FORBIDDEN,
HTTP_404_NOT_FOUND,
HTTP_405_METHOD_NOT_ALLOWED,
HTTP_406_NOT_ACCEPTABLE,
HTTP_416_REQUESTED_RANGE_NOT_SATISFIABLE,
HTTP_422_UNPROCESSABLE_ENTITY,
)

Expand Down Expand Up @@ -55,6 +57,7 @@
get_validation_registry,
slice_,
)
from .file_response_with_range import FileResponseWithRange
from .links import links_for_node
from .settings import get_settings
from .utils import filter_for_access, get_base_url, record_timing
Expand Down Expand Up @@ -1519,6 +1522,12 @@ async def delete_revision(
return json_or_msgpack(request, None)


# For simplicity of implementation, we support a restricted subset of the full
# Range spec. This could be extended if the need arises.
# https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Range
#
# Only a single range with both bounds written out is matched, e.g.
# 'bytes=0-499'. Open-ended ('bytes=500-'), suffix ('bytes=-500'), and
# multi-range ('bytes=0-1,4-5') forms do not match. Group 1 captures the start
# offset and group 2 the end offset, each as a string of decimal digits.
RANGE_HEADER_PATTERN = re.compile(r"^bytes=(\d+)-(\d+)$")


@router.get("/asset/bytes/{path:path}")
async def get_asset(
request: Request,
Expand Down Expand Up @@ -1587,12 +1596,34 @@ async def get_asset(
full_path = path
stat_result = await anyio.to_thread.run_sync(os.stat, full_path)
filename = full_path.name
return FileResponse(
if "range" in request.headers:
range_header = request.headers["range"]
match = RANGE_HEADER_PATTERN.match(range_header)
if match is None:
raise HTTPException(
status_code=HTTP_400_BAD_REQUEST,
detail=(
"Only a Range headers of the form 'bytes=start-end' are supported. "
f"Could not parse Range header: {range_header}",
),
)
range = start, _ = (int(match.group(1)), int(match.group(2)))
if start > stat_result.st_size:
raise HTTPException(
status_code=HTTP_416_REQUESTED_RANGE_NOT_SATISFIABLE,
headers={"content-range": f"bytes */{stat_result.st_size}"},
)
status_code = HTTP_206_PARTIAL_CONTENT
else:
range = None
status_code = HTTP_200_OK
return FileResponseWithRange(
full_path,
stat_result=stat_result,
method="GET",
status_code=HTTP_200_OK,
status_code=status_code,
headers={"Content-Disposition": f'attachment; filename="{filename}"'},
range=range,
)


Expand Down

0 comments on commit 1e94f4d

Please sign in to comment.