Skip to content
This repository has been archived by the owner on Apr 26, 2024. It is now read-only.

Prevent local quarantined media from being claimed by media retention #12972

Merged
merged 11 commits into from
Jun 7, 2022
Merged
1 change: 1 addition & 0 deletions changelog.d/12972.feature
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Add new `media_retention` options to the homeserver config for routinely cleaning up non-recently accessed media.
8 changes: 3 additions & 5 deletions synapse/rest/admin/media.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,7 @@ async def on_POST(
requester = await self.auth.get_user_by_req(request)
await assert_user_is_admin(self.auth, requester.user)

logging.info("Quarantining local media by user: %s", user_id)
logging.info("Quarantining media by user: %s", user_id)

# Quarantine all media this user has uploaded
num_quarantined = await self.store.quarantine_media_ids_by_user(
Expand Down Expand Up @@ -112,7 +112,7 @@ async def on_POST(
requester = await self.auth.get_user_by_req(request)
await assert_user_is_admin(self.auth, requester.user)

logging.info("Quarantining local media by ID: %s/%s", server_name, media_id)
logging.info("Quarantining media by ID: %s/%s", server_name, media_id)

# Quarantine this media id
await self.store.quarantine_media_by_id(
Expand Down Expand Up @@ -140,9 +140,7 @@ async def on_POST(
) -> Tuple[int, JsonDict]:
await assert_requester_is_admin(self.auth, request)

logging.info(
"Remove from quarantine local media by ID: %s/%s", server_name, media_id
)
logging.info("Remove from quarantine media by ID: %s/%s", server_name, media_id)

# Remove from quarantine this media id
await self.store.quarantine_media_by_id(server_name, media_id, None)
Expand Down
18 changes: 12 additions & 6 deletions synapse/rest/media/v1/media_repository.py
Original file line number Diff line number Diff line change
Expand Up @@ -919,10 +919,13 @@ async def _apply_media_retention_rules(self) -> None:
await self.delete_old_local_media(
before_ts=local_media_threshold_timestamp_ms,
keep_profiles=True,
delete_quarantined_media=False,
)

async def delete_old_remote_media(self, before_ts: int) -> Dict[str, int]:
old_media = await self.store.get_remote_media_before(before_ts)
old_media = await self.store.get_remote_media_ids(
before_ts, include_quarantined_media=False
)
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hm, there's a bit of a discrepancy here in that delete_old_local_media gives you a choice of whether you want to delete quarantined media or not, whereas delete_old_remote_media decides for you (always false).

This arose from the fact that delete_old_local_media had an admin API attached to it which may want to sprout a new query parameter for whether quarantined media is deleted, whereas the remote media side had no such API.

It'd probably still be nice to align the two though.


deleted = 0

Expand Down Expand Up @@ -975,25 +978,28 @@ async def delete_old_local_media(
before_ts: int,
size_gt: int = 0,
keep_profiles: bool = True,
delete_quarantined_media: bool = False,
) -> Tuple[List[str], int]:
"""
Delete local or remote media from this server by size and timestamp. Removes
media files, any thumbnails and cached URLs.

Args:
before_ts: Unix timestamp in ms.
Files that were last used before this timestamp will be deleted
size_gt: Size of the media in bytes. Files that are larger will be deleted
Files that were last used before this timestamp will be deleted.
size_gt: Size of the media in bytes. Files that are larger will be deleted.
keep_profiles: Switch to delete also files that are still used in image data
(e.g user profile, room avatar)
If false these files will be deleted
(e.g user profile, room avatar). If false these files will be deleted.
delete_quarantined_media: If True, media marked as quarantined will be deleted.

Returns:
A tuple of (list of deleted media IDs, total deleted media IDs).
"""
old_media = await self.store.get_local_media_before(
old_media = await self.store.get_local_media_ids(
before_ts,
size_gt,
keep_profiles,
include_quarantined_media=delete_quarantined_media,
)
return await self._remove_local_media_from_disk(old_media)

Expand Down
59 changes: 54 additions & 5 deletions synapse/storage/databases/main/media_repository.py
Original file line number Diff line number Diff line change
Expand Up @@ -251,12 +251,33 @@ def get_local_media_by_user_paginate_txn(
"get_local_media_by_user_paginate_txn", get_local_media_by_user_paginate_txn
)

async def get_local_media_before(
async def get_local_media_ids(
self,
before_ts: int,
size_gt: int,
keep_profiles: bool,
include_quarantined_media: bool,
) -> List[str]:
"""
Retrieve a list of media IDs from the local media store.

Args:
before_ts: Only retrieve IDs from media that was either last accessed
(or if never accessed, created) before the given UNIX timestamp in ms.
size_gt: Only retrieve IDs from media that has a size (in bytes) greater than
the given integer.
keep_profiles: If True, exclude media IDs from the results that are used in the
following situations:
* global profile user avatar
* per-room profile user avatar
* room avatar
* a user's avatar in the user directory
include_quarantined_media: If False, exclude media IDs from the results that have
been marked as quarantined.

Returns:
A list of local media IDs.
"""

# to find files that have never been accessed (last_access_ts IS NULL)
# compare with `created_ts`
Expand Down Expand Up @@ -294,12 +315,18 @@ async def get_local_media_before(
)
sql += sql_keep

def _get_local_media_before_txn(txn: LoggingTransaction) -> List[str]:
if include_quarantined_media is False:
# Only include media that has not been quarantined
sql += """
AND quarantined_by IS NULL
"""

def _get_local_media_ids_txn(txn: LoggingTransaction) -> List[str]:
txn.execute(sql, (before_ts, before_ts, size_gt))
return [row[0] for row in txn]

return await self.db_pool.runInteraction(
"get_local_media_before", _get_local_media_before_txn
"get_local_media_ids", _get_local_media_ids_txn
)

async def store_local_media(
Expand Down Expand Up @@ -599,15 +626,37 @@ async def store_remote_media_thumbnail(
desc="store_remote_media_thumbnail",
)

async def get_remote_media_before(self, before_ts: int) -> List[Dict[str, str]]:
async def get_remote_media_ids(
self, before_ts: int, include_quarantined_media: bool
) -> List[Dict[str, str]]:
"""
Retrieve a list of server name, media ID tuples from the remote media cache.

Args:
before_ts: Only retrieve IDs from media that was either last accessed
(or if never accessed, created) before the given UNIX timestamp in ms.
include_quarantined_media: If False, exclude media IDs from the results that have
been marked as quarantined.

Returns:
A list of tuples containing:
* The server name of homeserver where the media originates from,
* The ID of the media.
"""
sql = (
"SELECT media_origin, media_id, filesystem_id"
" FROM remote_media_cache"
" WHERE last_access_ts < ?"
)

if include_quarantined_media is False:
# Only include media that has not been quarantined
sql += """
AND quarantined_by IS NULL
"""

return await self.db_pool.execute(
"get_remote_media_before", self.db_pool.cursor_to_dict, sql, before_ts
"get_remote_media_ids", self.db_pool.cursor_to_dict, sql, before_ts
)

async def delete_remote_media(self, media_origin: str, media_id: str) -> None:
Expand Down
80 changes: 69 additions & 11 deletions tests/rest/media/test_media_retention.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,8 +58,9 @@ def prepare(self, reactor: MemoryReactor, clock: Clock, hs: HomeServer) -> None:
media_repository = hs.get_media_repository()
test_media_content = b"example string"

def _create_media_and_set_last_accessed(
def _create_media_and_set_attributes(
last_accessed_ms: Optional[int],
is_quarantined: Optional[bool] = False,
) -> str:
# "Upload" some media to the local media store
mxc_uri = self.get_success(
Expand All @@ -84,10 +85,22 @@ def _create_media_and_set_last_accessed(
)
)

if is_quarantined:
# Mark this media as quarantined
self.get_success(
self.store.quarantine_media_by_id(
server_name=self.hs.config.server.server_name,
media_id=media_id,
quarantined_by="@theadmin:test",
)
)

return media_id

def _cache_remote_media_and_set_last_accessed(
media_id: str, last_accessed_ms: Optional[int]
def _cache_remote_media_and_set_attributes(
media_id: str,
last_accessed_ms: Optional[int],
is_quarantined: Optional[bool] = False,
) -> str:
# Pretend to cache some remote media
self.get_success(
Expand All @@ -112,23 +125,52 @@ def _cache_remote_media_and_set_last_accessed(
)
)

if is_quarantined:
# Mark this media as quarantined
self.get_success(
self.store.quarantine_media_by_id(
server_name=self.remote_server_name,
media_id=media_id,
quarantined_by="@theadmin:test",
)
)

return media_id

# Start with the local media store
self.local_recently_accessed_media = _create_media_and_set_last_accessed(
self.THIRTY_DAYS_IN_MS
self.local_recently_accessed_media = _create_media_and_set_attributes(
last_accessed_ms=self.THIRTY_DAYS_IN_MS,
)
self.local_not_recently_accessed_media = _create_media_and_set_attributes(
last_accessed_ms=self.ONE_DAY_IN_MS,
)
self.local_not_recently_accessed_media = _create_media_and_set_last_accessed(
self.ONE_DAY_IN_MS
self.local_not_recently_accessed_quarantined_media = (
_create_media_and_set_attributes(
last_accessed_ms=self.ONE_DAY_IN_MS,
is_quarantined=True,
)
)
self.local_never_accessed_media = _create_media_and_set_attributes(
last_accessed_ms=None,
)
self.local_never_accessed_media = _create_media_and_set_last_accessed(None)

# And now the remote media store
self.remote_recently_accessed_media = _cache_remote_media_and_set_last_accessed(
"a", self.THIRTY_DAYS_IN_MS
self.remote_recently_accessed_media = _cache_remote_media_and_set_attributes(
media_id="a",
last_accessed_ms=self.THIRTY_DAYS_IN_MS,
)
self.remote_not_recently_accessed_media = (
_cache_remote_media_and_set_last_accessed("b", self.ONE_DAY_IN_MS)
_cache_remote_media_and_set_attributes(
media_id="b",
last_accessed_ms=self.ONE_DAY_IN_MS,
)
)
self.remote_not_recently_accessed_quarantined_media = (
_cache_remote_media_and_set_attributes(
media_id="c",
last_accessed_ms=self.ONE_DAY_IN_MS,
is_quarantined=True,
)
)
# Remote media will always have a "last accessed" attribute, as it would not
# be fetched from the remote homeserver unless instigated by a user.
Expand Down Expand Up @@ -163,8 +205,16 @@ def test_local_media_retention(self) -> None:
],
not_purged=[
(self.hs.config.server.server_name, self.local_recently_accessed_media),
(
self.hs.config.server.server_name,
self.local_not_recently_accessed_quarantined_media,
),
(self.remote_server_name, self.remote_recently_accessed_media),
(self.remote_server_name, self.remote_not_recently_accessed_media),
(
self.remote_server_name,
self.remote_not_recently_accessed_quarantined_media,
),
],
)

Expand Down Expand Up @@ -199,6 +249,14 @@ def test_remote_media_cache_retention(self) -> None:
self.hs.config.server.server_name,
self.local_not_recently_accessed_media,
),
(
self.hs.config.server.server_name,
self.local_not_recently_accessed_quarantined_media,
),
(
self.remote_server_name,
self.remote_not_recently_accessed_quarantined_media,
),
(self.hs.config.server.server_name, self.local_never_accessed_media),
],
)
Expand Down