Skip to content
This repository has been archived by the owner on Apr 26, 2024. It is now read-only.

Commit

Permalink
Avoid storing URL cache files in storage providers (#10911)
Browse files Browse the repository at this point in the history
URL cache files are short-lived and it does not make sense to offload
them (eg. to the cloud) or back them up.
  • Loading branch information
squahtx authored Sep 27, 2021
1 parent 6c83c27 commit f7768f6
Show file tree
Hide file tree
Showing 6 changed files with 154 additions and 6 deletions.
1 change: 1 addition & 0 deletions changelog.d/10911.bugfix
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Avoid storing URL cache files in storage providers. Server admins may safely delete the `url_cache/` and `url_cache_thumbnails/` directories from any configured storage providers to reclaim space.
7 changes: 7 additions & 0 deletions docs/upgrade.md
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,13 @@ process, for example:
dpkg -i matrix-synapse-py3_1.3.0+stretch1_amd64.deb
```

# Upgrading to v1.44.0

## The URL preview cache is no longer mirrored to storage providers
The `url_cache/` and `url_cache_thumbnails/` directories in the media store are
no longer mirrored to storage providers. These two directories can be safely
deleted from any configured storage providers to reclaim space.

# Upgrading to v1.43.0

## The spaces summary APIs can now be handled by workers
Expand Down
11 changes: 6 additions & 5 deletions synapse/rest/media/v1/filepath.py
Original file line number Diff line number Diff line change
Expand Up @@ -195,23 +195,24 @@ def url_cache_thumbnail_rel(

url_cache_thumbnail = _wrap_in_base_path(url_cache_thumbnail_rel)

def url_cache_thumbnail_directory(self, media_id: str) -> str:
def url_cache_thumbnail_directory_rel(self, media_id: str) -> str:
# Media id is of the form <DATE><RANDOM_STRING>
# E.g.: 2017-09-28-fsdRDt24DS234dsf

if NEW_FORMAT_ID_RE.match(media_id):
return os.path.join(
self.base_path, "url_cache_thumbnails", media_id[:10], media_id[11:]
)
return os.path.join("url_cache_thumbnails", media_id[:10], media_id[11:])
else:
return os.path.join(
self.base_path,
"url_cache_thumbnails",
media_id[0:2],
media_id[2:4],
media_id[4:],
)

url_cache_thumbnail_directory = _wrap_in_base_path(
url_cache_thumbnail_directory_rel
)

def url_cache_thumbnail_dirs_to_delete(self, media_id: str) -> List[str]:
"The dirs to try and remove if we delete the media_id thumbnails"
# Media id is of the form <DATE><RANDOM_STRING>
Expand Down
1 change: 0 additions & 1 deletion synapse/rest/media/v1/preview_url_resource.py
Original file line number Diff line number Diff line change
Expand Up @@ -485,7 +485,6 @@ def _start_expire_url_cache_data(self) -> Deferred:

async def _expire_url_cache_data(self) -> None:
"""Clean up expired url cache content, media and thumbnails."""
# TODO: Delete from backup media store

assert self._worker_run_media_background_jobs

Expand Down
10 changes: 10 additions & 0 deletions synapse/rest/media/v1/storage_provider.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,11 @@ async def store_file(self, path: str, file_info: FileInfo) -> None:
if file_info.server_name and not self.store_remote:
return None

if file_info.url_cache:
# The URL preview cache is short lived and not worth offloading or
# backing up.
return None

if self.store_synchronous:
# store_file is supposed to return an Awaitable, but guard
# against improper implementations.
Expand All @@ -110,6 +115,11 @@ async def store() -> None:
run_in_background(store)

async def fetch(self, path: str, file_info: FileInfo) -> Optional[Responder]:
if file_info.url_cache:
# Files in the URL preview cache definitely aren't stored here,
# so avoid any potentially slow I/O or network access.
return None

# store_file is supposed to return an Awaitable, but guard
# against improper implementations.
return await maybe_awaitable(self.backend.fetch(path, file_info))
Expand Down
130 changes: 130 additions & 0 deletions tests/rest/media/v1/test_url_preview.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
from twisted.test.proto_helpers import AccumulatingProtocol

from synapse.config.oembed import OEmbedEndpointConfig
from synapse.util.stringutils import parse_and_validate_mxc_uri

from tests import unittest
from tests.server import FakeTransport
Expand Down Expand Up @@ -721,3 +722,132 @@ def test_oembed_format(self):
"og:description": "Content Preview",
},
)

def _download_image(self):
"""Downloads an image into the URL cache.
Returns:
A (host, media_id) tuple representing the MXC URI of the image.
"""
self.lookups["cdn.twitter.com"] = [(IPv4Address, "10.1.2.3")]

channel = self.make_request(
"GET",
"preview_url?url=http://cdn.twitter.com/matrixdotorg",
shorthand=False,
await_result=False,
)
self.pump()

client = self.reactor.tcpClients[0][2].buildProtocol(None)
server = AccumulatingProtocol()
server.makeConnection(FakeTransport(client, self.reactor))
client.makeConnection(FakeTransport(server, self.reactor))
client.dataReceived(
b"HTTP/1.0 200 OK\r\nContent-Length: %d\r\nContent-Type: image/png\r\n\r\n"
% (len(SMALL_PNG),)
+ SMALL_PNG
)

self.pump()
self.assertEqual(channel.code, 200)
body = channel.json_body
mxc_uri = body["og:image"]
host, _port, media_id = parse_and_validate_mxc_uri(mxc_uri)
self.assertIsNone(_port)
return host, media_id

def test_storage_providers_exclude_files(self):
"""Test that files are not stored in or fetched from storage providers."""
host, media_id = self._download_image()

rel_file_path = self.preview_url.filepaths.url_cache_filepath_rel(media_id)
media_store_path = os.path.join(self.media_store_path, rel_file_path)
storage_provider_path = os.path.join(self.storage_path, rel_file_path)

# Check storage
self.assertTrue(os.path.isfile(media_store_path))
self.assertFalse(
os.path.isfile(storage_provider_path),
"URL cache file was unexpectedly stored in a storage provider",
)

# Check fetching
channel = self.make_request(
"GET",
f"download/{host}/{media_id}",
shorthand=False,
await_result=False,
)
self.pump()
self.assertEqual(channel.code, 200)

# Move cached file into the storage provider
os.makedirs(os.path.dirname(storage_provider_path), exist_ok=True)
os.rename(media_store_path, storage_provider_path)

channel = self.make_request(
"GET",
f"download/{host}/{media_id}",
shorthand=False,
await_result=False,
)
self.pump()
self.assertEqual(
channel.code,
404,
"URL cache file was unexpectedly retrieved from a storage provider",
)

def test_storage_providers_exclude_thumbnails(self):
"""Test that thumbnails are not stored in or fetched from storage providers."""
host, media_id = self._download_image()

rel_thumbnail_path = (
self.preview_url.filepaths.url_cache_thumbnail_directory_rel(media_id)
)
media_store_thumbnail_path = os.path.join(
self.media_store_path, rel_thumbnail_path
)
storage_provider_thumbnail_path = os.path.join(
self.storage_path, rel_thumbnail_path
)

# Check storage
self.assertTrue(os.path.isdir(media_store_thumbnail_path))
self.assertFalse(
os.path.isdir(storage_provider_thumbnail_path),
"URL cache thumbnails were unexpectedly stored in a storage provider",
)

# Check fetching
channel = self.make_request(
"GET",
f"thumbnail/{host}/{media_id}?width=32&height=32&method=scale",
shorthand=False,
await_result=False,
)
self.pump()
self.assertEqual(channel.code, 200)

# Remove the original, otherwise thumbnails will regenerate
rel_file_path = self.preview_url.filepaths.url_cache_filepath_rel(media_id)
media_store_path = os.path.join(self.media_store_path, rel_file_path)
os.remove(media_store_path)

# Move cached thumbnails into the storage provider
os.makedirs(os.path.dirname(storage_provider_thumbnail_path), exist_ok=True)
os.rename(media_store_thumbnail_path, storage_provider_thumbnail_path)

channel = self.make_request(
"GET",
f"thumbnail/{host}/{media_id}?width=32&height=32&method=scale",
shorthand=False,
await_result=False,
)
self.pump()
self.assertEqual(
channel.code,
404,
"URL cache thumbnail was unexpectedly retrieved from a storage provider",
)

0 comments on commit f7768f6

Please sign in to comment.