Skip to content
This repository has been archived by the owner on Apr 26, 2024. It is now read-only.

Prevent local quarantined media from being claimed by media retention #12972

Merged
merged 11 commits into from
Jun 7, 2022
Merged
1 change: 1 addition & 0 deletions changelog.d/12972.feature
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Add new `media_retention` options to the homeserver config for routinely cleaning up non-recently accessed media.
6 changes: 6 additions & 0 deletions docs/usage/configuration/config_documentation.md
Original file line number Diff line number Diff line change
Expand Up @@ -1583,6 +1583,12 @@ been accessed, the media's creation time is used instead. Both thumbnails
and the original media will be removed. If either of these options is unset,
then media of that type will not be purged.

Local or cached remote media that has been
[quarantined](../../admin_api/media_admin_api.md#quarantining-media-in-a-room)
will not be deleted. Similarly, local media that has been marked as
[protected from quarantine](../../admin_api/media_admin_api.md#protecting-media-from-being-quarantined)
will not be deleted.

Example configuration:
```yaml
media_retention:
Expand Down
8 changes: 3 additions & 5 deletions synapse/rest/admin/media.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,7 @@ async def on_POST(
requester = await self.auth.get_user_by_req(request)
await assert_user_is_admin(self.auth, requester.user)

logging.info("Quarantining local media by user: %s", user_id)
logging.info("Quarantining media by user: %s", user_id)

# Quarantine all media this user has uploaded
num_quarantined = await self.store.quarantine_media_ids_by_user(
Expand Down Expand Up @@ -112,7 +112,7 @@ async def on_POST(
requester = await self.auth.get_user_by_req(request)
await assert_user_is_admin(self.auth, requester.user)

logging.info("Quarantining local media by ID: %s/%s", server_name, media_id)
logging.info("Quarantining media by ID: %s/%s", server_name, media_id)

# Quarantine this media id
await self.store.quarantine_media_by_id(
Expand Down Expand Up @@ -140,9 +140,7 @@ async def on_POST(
) -> Tuple[int, JsonDict]:
await assert_requester_is_admin(self.auth, request)

logging.info(
"Remove from quarantine local media by ID: %s/%s", server_name, media_id
)
logging.info("Remove from quarantine media by ID: %s/%s", server_name, media_id)

# Remove from quarantine this media id
await self.store.quarantine_media_by_id(server_name, media_id, None)
Expand Down
21 changes: 15 additions & 6 deletions synapse/rest/media/v1/media_repository.py
Original file line number Diff line number Diff line change
Expand Up @@ -919,10 +919,14 @@ async def _apply_media_retention_rules(self) -> None:
await self.delete_old_local_media(
before_ts=local_media_threshold_timestamp_ms,
keep_profiles=True,
delete_quarantined_media=False,
delete_protected_media=False,
)

async def delete_old_remote_media(self, before_ts: int) -> Dict[str, int]:
old_media = await self.store.get_remote_media_before(before_ts)
old_media = await self.store.get_remote_media_ids(
before_ts, include_quarantined_media=False
)

deleted = 0

Expand Down Expand Up @@ -975,25 +979,30 @@ async def delete_old_local_media(
before_ts: int,
size_gt: int = 0,
keep_profiles: bool = True,
delete_quarantined_media: bool = False,
delete_protected_media: bool = False,
) -> Tuple[List[str], int]:
"""
Delete local media from this server by size and timestamp. Removes
media files, any thumbnails and cached URLs.

Args:
before_ts: Unix timestamp in ms.
Files that were last used before this timestamp will be deleted
size_gt: Size of the media in bytes. Files that are larger will be deleted
Files that were last used before this timestamp will be deleted.
size_gt: Size of the media in bytes. Files that are larger will be deleted.
keep_profiles: Switch to delete also files that are still used in image data
(e.g user profile, room avatar)
If false these files will be deleted
(e.g user profile, room avatar). If false these files will be deleted.
delete_quarantined_media: If True, media marked as quarantined will be deleted.
delete_protected_media: If True, media marked as protected from quarantine will be deleted.

Returns:
A tuple of (list of deleted media IDs, total deleted media IDs).
"""
old_media = await self.store.get_local_media_before(
old_media = await self.store.get_local_media_ids(
before_ts,
size_gt,
keep_profiles,
include_quarantined_media=delete_quarantined_media,
include_protected_media=delete_protected_media,
)
return await self._remove_local_media_from_disk(old_media)

Expand Down
68 changes: 63 additions & 5 deletions synapse/storage/databases/main/media_repository.py
Original file line number Diff line number Diff line change
Expand Up @@ -251,12 +251,36 @@ def get_local_media_by_user_paginate_txn(
"get_local_media_by_user_paginate_txn", get_local_media_by_user_paginate_txn
)

async def get_local_media_before(
async def get_local_media_ids(
self,
before_ts: int,
size_gt: int,
keep_profiles: bool,
include_quarantined_media: bool,
include_protected_media: bool,
) -> List[str]:
"""
Retrieve a list of media IDs from the local media store.

Args:
before_ts: Only retrieve IDs from media that was either last accessed
(or if never accessed, created) before the given UNIX timestamp in ms.
size_gt: Only retrieve IDs from media that has a size (in bytes) greater than
the given integer.
keep_profiles: If True, exclude media IDs from the results that are used in the
following situations:
* global profile user avatar
* per-room profile user avatar
* room avatar
* a user's avatar in the user directory
include_quarantined_media: If False, exclude media IDs from the results that have
been marked as quarantined.
include_protected_media: If False, exclude media IDs from the results that have
been marked as protected from quarantine.

Returns:
A list of local media IDs.
"""

# to find files that have never been accessed (last_access_ts IS NULL)
# compare with `created_ts`
Expand Down Expand Up @@ -294,12 +318,24 @@ async def get_local_media_before(
)
sql += sql_keep

def _get_local_media_before_txn(txn: LoggingTransaction) -> List[str]:
if include_quarantined_media is False:
# Do not include media that has been quarantined
sql += """
AND quarantined_by IS NULL
"""

if include_protected_media is False:
# Do not include media that has been protected from quarantine
sql += """
AND safe_from_quarantine = 0
"""

def _get_local_media_ids_txn(txn: LoggingTransaction) -> List[str]:
txn.execute(sql, (before_ts, before_ts, size_gt))
return [row[0] for row in txn]

return await self.db_pool.runInteraction(
"get_local_media_before", _get_local_media_before_txn
"get_local_media_ids", _get_local_media_ids_txn
)

async def store_local_media(
Expand Down Expand Up @@ -599,15 +635,37 @@ async def store_remote_media_thumbnail(
desc="store_remote_media_thumbnail",
)

async def get_remote_media_before(self, before_ts: int) -> List[Dict[str, str]]:
async def get_remote_media_ids(
self, before_ts: int, include_quarantined_media: bool
) -> List[Dict[str, str]]:
"""
Retrieve a list of media entries from the remote media cache.

Args:
before_ts: Only retrieve media that was last accessed before the given
UNIX timestamp in ms.
include_quarantined_media: If False, exclude media from the results that has
been marked as quarantined.

Returns:
A list of dicts, one per matching media entry, each containing:
* `media_origin`: the server name of the homeserver the media originates from,
* `media_id`: the ID of the media,
* `filesystem_id`: the value of the `filesystem_id` column for that entry.
"""
sql = (
"SELECT media_origin, media_id, filesystem_id"
" FROM remote_media_cache"
" WHERE last_access_ts < ?"
)

if include_quarantined_media is False:
# Only include media that has not been quarantined
sql += """
AND quarantined_by IS NULL
"""

return await self.db_pool.execute(
"get_remote_media_before", self.db_pool.cursor_to_dict, sql, before_ts
"get_remote_media_ids", self.db_pool.cursor_to_dict, sql, before_ts
)

async def delete_remote_media(self, media_origin: str, media_id: str) -> None:
Expand Down
109 changes: 96 additions & 13 deletions tests/rest/media/test_media_retention.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,13 +53,16 @@ def prepare(self, reactor: MemoryReactor, clock: Clock, hs: HomeServer) -> None:
# Create a user to upload media with
test_user_id = self.register_user("alice", "password")

# Inject media (3 images each; recently accessed, old access, never accessed)
# into both the local store and the remote cache
# Inject media (recently accessed, old access, never accessed, old access
# quarantined media) into both the local store and the remote cache, plus
# one additional local media that is marked as protected from quarantine.
media_repository = hs.get_media_repository()
test_media_content = b"example string"

def _create_media_and_set_last_accessed(
def _create_media_and_set_attributes(
last_accessed_ms: Optional[int],
is_quarantined: Optional[bool] = False,
is_protected: Optional[bool] = False,
) -> str:
# "Upload" some media to the local media store
mxc_uri = self.get_success(
Expand All @@ -84,10 +87,31 @@ def _create_media_and_set_last_accessed(
)
)

if is_quarantined:
# Mark this media as quarantined
self.get_success(
self.store.quarantine_media_by_id(
server_name=self.hs.config.server.server_name,
media_id=media_id,
quarantined_by="@theadmin:test",
)
)

if is_protected:
# Mark this media as protected from quarantine
self.get_success(
self.store.mark_local_media_as_safe(
media_id=media_id,
safe=True,
)
)

return media_id

def _cache_remote_media_and_set_last_accessed(
media_id: str, last_accessed_ms: Optional[int]
def _cache_remote_media_and_set_attributes(
media_id: str,
last_accessed_ms: Optional[int],
is_quarantined: Optional[bool] = False,
) -> str:
# Pretend to cache some remote media
self.get_success(
Expand All @@ -112,23 +136,58 @@ def _cache_remote_media_and_set_last_accessed(
)
)

if is_quarantined:
# Mark this media as quarantined
self.get_success(
self.store.quarantine_media_by_id(
server_name=self.remote_server_name,
media_id=media_id,
quarantined_by="@theadmin:test",
)
)

return media_id

# Start with the local media store
self.local_recently_accessed_media = _create_media_and_set_last_accessed(
self.THIRTY_DAYS_IN_MS
self.local_recently_accessed_media = _create_media_and_set_attributes(
last_accessed_ms=self.THIRTY_DAYS_IN_MS,
)
self.local_not_recently_accessed_media = _create_media_and_set_last_accessed(
self.ONE_DAY_IN_MS
self.local_not_recently_accessed_media = _create_media_and_set_attributes(
last_accessed_ms=self.ONE_DAY_IN_MS,
)
self.local_not_recently_accessed_quarantined_media = (
_create_media_and_set_attributes(
last_accessed_ms=self.ONE_DAY_IN_MS,
is_quarantined=True,
)
)
self.local_not_recently_accessed_protected_media = (
_create_media_and_set_attributes(
last_accessed_ms=self.ONE_DAY_IN_MS,
is_protected=True,
)
)
self.local_never_accessed_media = _create_media_and_set_attributes(
last_accessed_ms=None,
)
self.local_never_accessed_media = _create_media_and_set_last_accessed(None)

# And now the remote media store
self.remote_recently_accessed_media = _cache_remote_media_and_set_last_accessed(
"a", self.THIRTY_DAYS_IN_MS
self.remote_recently_accessed_media = _cache_remote_media_and_set_attributes(
media_id="a",
last_accessed_ms=self.THIRTY_DAYS_IN_MS,
)
self.remote_not_recently_accessed_media = (
_cache_remote_media_and_set_last_accessed("b", self.ONE_DAY_IN_MS)
_cache_remote_media_and_set_attributes(
media_id="b",
last_accessed_ms=self.ONE_DAY_IN_MS,
)
)
self.remote_not_recently_accessed_quarantined_media = (
_cache_remote_media_and_set_attributes(
media_id="c",
last_accessed_ms=self.ONE_DAY_IN_MS,
is_quarantined=True,
)
)
# Remote media will always have a "last accessed" attribute, as it would not
# be fetched from the remote homeserver unless instigated by a user.
Expand Down Expand Up @@ -163,8 +222,20 @@ def test_local_media_retention(self) -> None:
],
not_purged=[
(self.hs.config.server.server_name, self.local_recently_accessed_media),
(
self.hs.config.server.server_name,
self.local_not_recently_accessed_quarantined_media,
),
(
self.hs.config.server.server_name,
self.local_not_recently_accessed_protected_media,
),
(self.remote_server_name, self.remote_recently_accessed_media),
(self.remote_server_name, self.remote_not_recently_accessed_media),
(
self.remote_server_name,
self.remote_not_recently_accessed_quarantined_media,
),
],
)

Expand Down Expand Up @@ -199,6 +270,18 @@ def test_remote_media_cache_retention(self) -> None:
self.hs.config.server.server_name,
self.local_not_recently_accessed_media,
),
(
self.hs.config.server.server_name,
self.local_not_recently_accessed_quarantined_media,
),
(
self.hs.config.server.server_name,
self.local_not_recently_accessed_protected_media,
),
(
self.remote_server_name,
self.remote_not_recently_accessed_quarantined_media,
),
(self.hs.config.server.server_name, self.local_never_accessed_media),
],
)
Expand Down