From 21dcf4a62bbce8ec76303658a6663ba507e8516a Mon Sep 17 00:00:00 2001 From: Andrew Morgan Date: Fri, 13 May 2022 14:14:46 +0100 Subject: [PATCH 01/14] Add config options for media retention --- synapse/config/repository.py | 50 ++++++++++++++++++++++++++++++++++++ 1 file changed, 50 insertions(+) diff --git a/synapse/config/repository.py b/synapse/config/repository.py index 98d8a16621ad..cac6bf63f9a4 100644 --- a/synapse/config/repository.py +++ b/synapse/config/repository.py @@ -223,6 +223,23 @@ def read_config(self, config: JsonDict, **kwargs: Any) -> None: "url_preview_accept_language" ) or ["en"] + media_retention = config.get("media_retention") or {} + self.media_retention_enabled = media_retention.get("enabled", False) + + self.media_retention_local_media_lifetime_ms = None + local_media_lifetime = media_retention.get("local_media_lifetime") + if local_media_lifetime is not None: + self.media_retention_local_media_lifetime_ms = self.parse_duration( + local_media_lifetime + ) + + self.media_retention_remote_media_lifetime_ms = None + remote_media_lifetime = media_retention.get("remote_media_lifetime") + if remote_media_lifetime is not None: + self.media_retention_remote_media_lifetime_ms = self.parse_duration( + remote_media_lifetime + ) + def generate_config_section(self, data_dir_path: str, **kwargs: Any) -> str: assert data_dir_path is not None media_store = os.path.join(data_dir_path, "media_store") @@ -289,6 +306,39 @@ def generate_config_section(self, data_dir_path: str, **kwargs: Any) -> str: #thumbnail_sizes: %(formatted_thumbnail_sizes)s + # Configure media retention settings. Media will be purged if it + # has not been accessed in at least this amount of time. If the + # media has never been access, the media's creation time is used + # instead. Both thumbnails and the original media will be removed. + # + # Media is 'accessed' when loaded in a room in a client, or + # otherwise downloaded by a local or remote user. 
+ # + media_retention: + # Whether media retention settings should apply. Defaults to + # false. + # + # Uncomment to enable media retention on this homeserver. + # + #enabled: true + + # How long to keep local media since its last access. Local + # media that is removed will be permanently deleted. + # + # If this option is not set, local media will not have a + # retention policy applied. + # + #local_media_lifetime: 30d + + # How long to keep downloaded remote media since its last + # access. Remote media will be downloaded again from the + # originating server on demand. + # + # If this option is not set, remote media will not have a + # retention policy applied. + # + remote_media_lifetime: 7d + # Is the preview URL API enabled? # # 'false' by default: uncomment the following to enable it (and specify a From 6046250532586689afb2347d311b6a5693d597e7 Mon Sep 17 00:00:00 2001 From: Andrew Morgan Date: Fri, 13 May 2022 14:50:57 +0100 Subject: [PATCH 02/14] Run a media retention background job using configured lifetimes --- synapse/rest/media/v1/media_repository.py | 70 ++++++++++++++++++++++- 1 file changed, 69 insertions(+), 1 deletion(-) diff --git a/synapse/rest/media/v1/media_repository.py b/synapse/rest/media/v1/media_repository.py index 3e5d6c629418..ca2f4b7340b1 100644 --- a/synapse/rest/media/v1/media_repository.py +++ b/synapse/rest/media/v1/media_repository.py @@ -65,7 +65,12 @@ logger = logging.getLogger(__name__) -UPDATE_RECENTLY_ACCESSED_TS = 60 * 1000 +# How often to run the background job to update the "recently accessed" +# attribute of local and remote media. +UPDATE_RECENTLY_ACCESSED_TS = 60 * 1000 # 1 minute +# How often to run the background job that purges local and remote media +# according to the configured media retention rules. 
+APPLY_MEDIA_RETENTION_RULES_PERIOD_MS = 60 * 60 * 1000 # 1 hour class MediaRepository: @@ -122,11 +127,32 @@ def __init__(self, hs: "HomeServer"): self._start_update_recently_accessed, UPDATE_RECENTLY_ACCESSED_TS ) + # Media retention configuration options + self._media_retention_local_media_lifetime_ms = ( + hs.config.media.media_retention_local_media_lifetime_ms + ) + self._media_retention_remote_media_lifetime_ms = ( + hs.config.media.media_retention_remote_media_lifetime_ms + ) + + if hs.config.media.media_retention_enabled: + # Run the background job to apply media retention rules every + # $APPLY_MEDIA_RETENTION_RULES_PERIOD_MS milliseconds. + self.clock.looping_call( + self._start_apply_media_retention_rules, + APPLY_MEDIA_RETENTION_RULES_PERIOD_MS, + ) + def _start_update_recently_accessed(self) -> Deferred: return run_as_background_process( "update_recently_accessed_media", self._update_recently_accessed ) + def _start_apply_media_retention_rules(self) -> Deferred: + return run_as_background_process( + "apply_media_retention_rules", self._apply_media_retention_rules + ) + async def _update_recently_accessed(self) -> None: remote_media = self.recently_accessed_remotes self.recently_accessed_remotes = set() @@ -835,6 +861,48 @@ async def _generate_thumbnails( return {"width": m_width, "height": m_height} + async def _apply_media_retention_rules(self) -> None: + """ + Purge old local and remote media according to the media retention rules + defined in the homeserver config. + + Raises: + ... + """ + # Purge remote media + if self._media_retention_remote_media_lifetime_ms is not None: + # Calculate a threshold timestamp derived from the configured lifetime. Any + # media that has not been accessed since this timestamp will be removed. 
+ remote_media_threshold_timestamp_ms = ( + self.clock.time_msec() - self._media_retention_remote_media_lifetime_ms + ) + + logger.info( + "Purging remote media last accessed before" + f" {remote_media_threshold_timestamp_ms}" + ) + + await self.delete_old_remote_media( + before_ts=remote_media_threshold_timestamp_ms + ) + + # And now do the same for local media + if self._media_retention_local_media_lifetime_ms is not None: + # This works the same as the remote media threshold + local_media_threshold_timestamp_ms = ( + self.clock.time_msec() - self._media_retention_local_media_lifetime_ms + ) + + logger.info( + "Purging local media last accessed before" + f" {local_media_threshold_timestamp_ms}" + ) + + await self.delete_old_local_media( + before_ts=local_media_threshold_timestamp_ms, + keep_profiles=True, + ) + async def delete_old_remote_media(self, before_ts: int) -> Dict[str, int]: old_media = await self.store.get_remote_media_before(before_ts) From f7f556edb4206cdf0ee574a1f619458e2db5654f Mon Sep 17 00:00:00 2001 From: Andrew Morgan Date: Fri, 13 May 2022 14:53:06 +0100 Subject: [PATCH 03/14] Generate sample config --- docs/sample_config.yaml | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/docs/sample_config.yaml b/docs/sample_config.yaml index e7b57f5a0bdf..ee4b72c90802 100644 --- a/docs/sample_config.yaml +++ b/docs/sample_config.yaml @@ -1043,6 +1043,39 @@ media_store_path: "DATADIR/media_store" # height: 600 # method: scale +# Configure media retention settings. Media will be purged if it +# has not been accessed in at least this amount of time. If the +# media has never been access, the media's creation time is used +# instead. Both thumbnails and the original media will be removed. +# +# Media is 'accessed' when loaded in a room in a client, or +# otherwise downloaded by a local or remote user. +# +media_retention: + # Whether media retention settings should apply. Defaults to + # false. 
+ # + # Uncomment to enable media retention on this homeserver. + # + #enabled: true + + # How long to keep local media since its last access. Local + # media that is removed will be permanently deleted. + # + # If this option is not set, local media will not have a + # retention policy applied. + # + #local_media_lifetime: 30d + + # How long to keep downloaded remote media since its last + # access. Remote media will be downloaded again from the + # originating server on demand. + # + # If this option is not set, remote media will not have a + # retention policy applied. + # + remote_media_lifetime: 7d + # Is the preview URL API enabled? # # 'false' by default: uncomment the following to enable it (and specify a From fb04e7ad4e682e3504075c508e30785a36ab1994 Mon Sep 17 00:00:00 2001 From: Andrew Morgan Date: Mon, 16 May 2022 17:39:27 +0100 Subject: [PATCH 04/14] Document that media purging will be done by the media worker --- docs/sample_config.yaml | 6 +++++- synapse/config/repository.py | 6 +++++- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/docs/sample_config.yaml b/docs/sample_config.yaml index ee4b72c90802..408d1bcafe4c 100644 --- a/docs/sample_config.yaml +++ b/docs/sample_config.yaml @@ -1045,12 +1045,16 @@ media_store_path: "DATADIR/media_store" # Configure media retention settings. Media will be purged if it # has not been accessed in at least this amount of time. If the -# media has never been access, the media's creation time is used +# media has never been accessed, the media's creation time is used # instead. Both thumbnails and the original media will be removed. # # Media is 'accessed' when loaded in a room in a client, or # otherwise downloaded by a local or remote user. # +# Purging the media will be the carried out by the media worker +# (whichever worker has the 'enable_media_repo' homeserver config +# option enabled). This may be the main process. +# media_retention: # Whether media retention settings should apply. 
Defaults to # false. diff --git a/synapse/config/repository.py b/synapse/config/repository.py index cac6bf63f9a4..59315bf0264d 100644 --- a/synapse/config/repository.py +++ b/synapse/config/repository.py @@ -308,12 +308,16 @@ def generate_config_section(self, data_dir_path: str, **kwargs: Any) -> str: # Configure media retention settings. Media will be purged if it # has not been accessed in at least this amount of time. If the - # media has never been access, the media's creation time is used + # media has never been accessed, the media's creation time is used # instead. Both thumbnails and the original media will be removed. # # Media is 'accessed' when loaded in a room in a client, or # otherwise downloaded by a local or remote user. # + # Purging the media will be the carried out by the media worker + # (whichever worker has the 'enable_media_repo' homeserver config + # option enabled). This may be the main process. + # media_retention: # Whether media retention settings should apply. Defaults to # false. From 35839a0782ca5f5c50ad125d915052ab1af6b6ee Mon Sep 17 00:00:00 2001 From: Andrew Morgan Date: Mon, 16 May 2022 18:58:55 +0100 Subject: [PATCH 05/14] Move the config option documentation to the config docs --- docs/sample_config.yaml | 37 ------------------- .../configuration/config_documentation.md | 33 ++++++++++++++++- synapse/config/repository.py | 37 ------------------- 3 files changed, 32 insertions(+), 75 deletions(-) diff --git a/docs/sample_config.yaml b/docs/sample_config.yaml index 408d1bcafe4c..e7b57f5a0bdf 100644 --- a/docs/sample_config.yaml +++ b/docs/sample_config.yaml @@ -1043,43 +1043,6 @@ media_store_path: "DATADIR/media_store" # height: 600 # method: scale -# Configure media retention settings. Media will be purged if it -# has not been accessed in at least this amount of time. If the -# media has never been accessed, the media's creation time is used -# instead. Both thumbnails and the original media will be removed. 
-# -# Media is 'accessed' when loaded in a room in a client, or -# otherwise downloaded by a local or remote user. -# -# Purging the media will be the carried out by the media worker -# (whichever worker has the 'enable_media_repo' homeserver config -# option enabled). This may be the main process. -# -media_retention: - # Whether media retention settings should apply. Defaults to - # false. - # - # Uncomment to enable media retention on this homeserver. - # - #enabled: true - - # How long to keep local media since its last access. Local - # media that is removed will be permanently deleted. - # - # If this option is not set, local media will not have a - # retention policy applied. - # - #local_media_lifetime: 30d - - # How long to keep downloaded remote media since its last - # access. Remote media will be downloaded again from the - # originating server on demand. - # - # If this option is not set, remote media will not have a - # retention policy applied. - # - remote_media_lifetime: 7d - # Is the preview URL API enabled? # # 'false' by default: uncomment the following to enable it (and specify a diff --git a/docs/usage/configuration/config_documentation.md b/docs/usage/configuration/config_documentation.md index f292b94fb0cd..1aa240eda641 100644 --- a/docs/usage/configuration/config_documentation.md +++ b/docs/usage/configuration/config_documentation.md @@ -1407,7 +1407,7 @@ federation_rr_transactions_per_room_per_second: 40 ``` --- ## Media Store ## -Config options relating to Synapse media store. +Config options related to Synapse's media store. --- Config option: `enable_media_repo` @@ -1511,6 +1511,37 @@ thumbnail_sizes: height: 600 method: scale ``` +--- +Config option: `media_retention` + +Controls whether local media and entries in the remote media cache +(media that is downloaded from other homeservers) should be removed +under certain conditions, typically for the purpose of saving space. 
+ +Purging media files will be the carried out by the media worker +(that is, the worker that has the `enable_media_repo` homeserver config +option set to 'true'). This may be the main process. + +The `media_retention.enabled` option globally controls whether media +retention is enabled. + +The `media_retention.local_media_lifetime` and +`media_retention.remote_media_lifetime` config options control whether +media will be purged if it has not been accessed in a given amount of +time. Note that media is 'accessed' when loaded in a room in a client, or +otherwise downloaded by a local or remote user. If the media has never +been accessed, the media's creation time is used instead. Both thumbnails +and the original media will be removed. If either of these options is unset, +then media of that type will not be purged. + +Example configuration: +```yaml +media_retention: + enabled: true + local_media_lifetime: 30d + remote_media_lifetime: 7d +``` +--- Config option: `url_preview_enabled` This setting determines whether the preview URL API is enabled. diff --git a/synapse/config/repository.py b/synapse/config/repository.py index 59315bf0264d..ff6e50cc7cd7 100644 --- a/synapse/config/repository.py +++ b/synapse/config/repository.py @@ -306,43 +306,6 @@ def generate_config_section(self, data_dir_path: str, **kwargs: Any) -> str: #thumbnail_sizes: %(formatted_thumbnail_sizes)s - # Configure media retention settings. Media will be purged if it - # has not been accessed in at least this amount of time. If the - # media has never been accessed, the media's creation time is used - # instead. Both thumbnails and the original media will be removed. - # - # Media is 'accessed' when loaded in a room in a client, or - # otherwise downloaded by a local or remote user. - # - # Purging the media will be the carried out by the media worker - # (whichever worker has the 'enable_media_repo' homeserver config - # option enabled). This may be the main process.
- # - media_retention: - # Whether media retention settings should apply. Defaults to - # false. - # - # Uncomment to enable media retention on this homeserver. - # - #enabled: true - - # How long to keep local media since its last access. Local - # media that is removed will be permanently deleted. - # - # If this option is not set, local media will not have a - # retention policy applied. - # - #local_media_lifetime: 30d - - # How long to keep downloaded remote media since its last - # access. Remote media will be downloaded again from the - # originating server on demand. - # - # If this option is not set, remote media will not have a - # retention policy applied. - # - remote_media_lifetime: 7d - # Is the preview URL API enabled? # # 'false' by default: uncomment the following to enable it (and specify a From eb6ed0315c79e5de41be5dc9c6104149361f8d53 Mon Sep 17 00:00:00 2001 From: Andrew Morgan Date: Mon, 16 May 2022 20:16:58 +0100 Subject: [PATCH 06/14] Add media_retention.purge_period to configure how often to purge --- docs/usage/configuration/config_documentation.md | 6 ++++++ synapse/config/repository.py | 4 ++++ synapse/rest/media/v1/media_repository.py | 9 +++------ 3 files changed, 13 insertions(+), 6 deletions(-) diff --git a/docs/usage/configuration/config_documentation.md b/docs/usage/configuration/config_documentation.md index 1aa240eda641..fd7f9e8fa64e 100644 --- a/docs/usage/configuration/config_documentation.md +++ b/docs/usage/configuration/config_documentation.md @@ -1525,6 +1525,11 @@ option set to 'true'). This may be the main process. The `media_retention.enabled` option globally controls whether media retention is enabled. +The `media_retention.purge_period` option dictates how often Synapse should +scan and purge media to be removed according to the configured thresholds. +For example, if set to "6h", Synapse will check every 6 hours for media +that can be purged. The default value is "24h" meaning 24 hours. 
+ The `media_retention.local_media_lifetime` and `media_retention.remote_media_lifetime` config options control whether media will be purged if it has not been accessed in a given amount of @@ -1538,6 +1543,7 @@ Example configuration: ```yaml media_retention: enabled: true + purge_period: 24h local_media_lifetime: 30d remote_media_lifetime: 7d ``` diff --git a/synapse/config/repository.py b/synapse/config/repository.py index ff6e50cc7cd7..c207afae9e23 100644 --- a/synapse/config/repository.py +++ b/synapse/config/repository.py @@ -226,6 +226,10 @@ def read_config(self, config: JsonDict, **kwargs: Any) -> None: media_retention = config.get("media_retention") or {} self.media_retention_enabled = media_retention.get("enabled", False) + self.media_retention_purge_period = self.parse_duration( + media_retention.get("purge_period", "24h") + ) + self.media_retention_local_media_lifetime_ms = None local_media_lifetime = media_retention.get("local_media_lifetime") if local_media_lifetime is not None: diff --git a/synapse/rest/media/v1/media_repository.py b/synapse/rest/media/v1/media_repository.py index ca2f4b7340b1..24c83e010bbd 100644 --- a/synapse/rest/media/v1/media_repository.py +++ b/synapse/rest/media/v1/media_repository.py @@ -68,9 +68,6 @@ # How often to run the background job to update the "recently accessed" # attribute of local and remote media. UPDATE_RECENTLY_ACCESSED_TS = 60 * 1000 # 1 minute -# How often to run the background job that purges local and remote media -# according to the configured media retention rules. -APPLY_MEDIA_RETENTION_RULES_PERIOD_MS = 60 * 60 * 1000 # 1 hour class MediaRepository: @@ -136,11 +133,11 @@ def __init__(self, hs: "HomeServer"): ) if hs.config.media.media_retention_enabled: - # Run the background job to apply media retention rules every - # $APPLY_MEDIA_RETENTION_RULES_PERIOD_MS milliseconds. 
+ # Run the background job to apply media retention rules routinely, + # with the duration between runs dictated by the homeserver config. self.clock.looping_call( self._start_apply_media_retention_rules, - APPLY_MEDIA_RETENTION_RULES_PERIOD_MS, + hs.config.media.media_retention_purge_period, ) def _start_update_recently_accessed(self) -> Deferred: From 61357eca78c0f66e2b5da141ea40cae7a90be285 Mon Sep 17 00:00:00 2001 From: Andrew Morgan Date: Wed, 18 May 2022 17:47:08 +0100 Subject: [PATCH 07/14] Remove the 'media_retention.enabled' config option --- docs/usage/configuration/config_documentation.md | 7 +++---- synapse/config/repository.py | 1 - synapse/rest/media/v1/media_repository.py | 6 +++++- 3 files changed, 8 insertions(+), 6 deletions(-) diff --git a/docs/usage/configuration/config_documentation.md b/docs/usage/configuration/config_documentation.md index fd7f9e8fa64e..62f125f700dc 100644 --- a/docs/usage/configuration/config_documentation.md +++ b/docs/usage/configuration/config_documentation.md @@ -1522,13 +1522,12 @@ Purging media files will be the carried out by the media worker (that is, the worker that has the `enable_media_repo` homeserver config option set to 'true'). This may be the main process. -The `media_retention.enabled` option globally controls whether media -retention is enabled. - The `media_retention.purge_period` option dictates how often Synapse should scan and purge media to be removed according to the configured thresholds. For example, if set to "6h", Synapse will check every 6 hours for media -that can be purged. The default value is "24h" meaning 24 hours. +that can be purged. The default value is "24h" meaning 24 hours. Synapse +will not regularly check for media to purge if no other media retention +options are set. 
The `media_retention.local_media_lifetime` and `media_retention.remote_media_lifetime` config options control whether diff --git a/synapse/config/repository.py b/synapse/config/repository.py index c207afae9e23..080e9f4bd48d 100644 --- a/synapse/config/repository.py +++ b/synapse/config/repository.py @@ -224,7 +224,6 @@ def read_config(self, config: JsonDict, **kwargs: Any) -> None: ) or ["en"] media_retention = config.get("media_retention") or {} - self.media_retention_enabled = media_retention.get("enabled", False) self.media_retention_purge_period = self.parse_duration( media_retention.get("purge_period", "24h") diff --git a/synapse/rest/media/v1/media_repository.py b/synapse/rest/media/v1/media_repository.py index 24c83e010bbd..eec6a5494c0a 100644 --- a/synapse/rest/media/v1/media_repository.py +++ b/synapse/rest/media/v1/media_repository.py @@ -132,7 +132,11 @@ def __init__(self, hs: "HomeServer"): hs.config.media.media_retention_remote_media_lifetime_ms ) - if hs.config.media.media_retention_enabled: + # Check whether local or remote media retention is configured + if ( + hs.config.media.media_retention_local_media_lifetime_ms is not None + or hs.config.media.media_retention_remote_media_lifetime_ms is not None + ): # Run the background job to apply media retention rules routinely, # with the duration between runs dictated by the homeserver config. 
self.clock.looping_call( From 1d859c91c83b7c206d63adef46cc7527d9b42b54 Mon Sep 17 00:00:00 2001 From: Andrew Morgan Date: Wed, 18 May 2022 18:18:48 +0100 Subject: [PATCH 08/14] Change the config example to non-default values --- docs/usage/configuration/config_documentation.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/usage/configuration/config_documentation.md b/docs/usage/configuration/config_documentation.md index 62f125f700dc..c4f642d10047 100644 --- a/docs/usage/configuration/config_documentation.md +++ b/docs/usage/configuration/config_documentation.md @@ -1542,9 +1542,9 @@ Example configuration: ```yaml media_retention: enabled: true - purge_period: 24h - local_media_lifetime: 30d - remote_media_lifetime: 7d + purge_period: 7d + local_media_lifetime: 90d + remote_media_lifetime: 14d ``` --- Config option: `url_preview_enabled` From a05d67b7b2af97b40d9993054139813fffe67bf9 Mon Sep 17 00:00:00 2001 From: Andrew Morgan Date: Wed, 18 May 2022 18:20:15 +0100 Subject: [PATCH 09/14] Remove 'enabled' from the config documentation example --- docs/usage/configuration/config_documentation.md | 1 - 1 file changed, 1 deletion(-) diff --git a/docs/usage/configuration/config_documentation.md b/docs/usage/configuration/config_documentation.md index c4f642d10047..4ac6989d9ead 100644 --- a/docs/usage/configuration/config_documentation.md +++ b/docs/usage/configuration/config_documentation.md @@ -1541,7 +1541,6 @@ then media of that type will not be purged. 
Example configuration: ```yaml media_retention: - enabled: true purge_period: 7d local_media_lifetime: 90d remote_media_lifetime: 14d From d120931150f5d45d97919d37ffbbba730d8f0aa9 Mon Sep 17 00:00:00 2001 From: Andrew Morgan Date: Thu, 26 May 2022 16:24:55 +0100 Subject: [PATCH 10/14] Add tests for media retention --- tests/rest/media/test_media_retention.py | 238 +++++++++++++++++++++++ 1 file changed, 238 insertions(+) create mode 100644 tests/rest/media/test_media_retention.py diff --git a/tests/rest/media/test_media_retention.py b/tests/rest/media/test_media_retention.py new file mode 100644 index 000000000000..ca3d4216dcbd --- /dev/null +++ b/tests/rest/media/test_media_retention.py @@ -0,0 +1,238 @@ +# Copyright 2021 The Matrix.org Foundation C.I.C. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import io +from typing import Iterable, Optional, Tuple + +from twisted.test.proto_helpers import MemoryReactor + +from synapse.rest import admin +from synapse.rest.client import login, register, room +from synapse.server import HomeServer +from synapse.types import UserID +from synapse.util import Clock + +from tests import unittest +from tests.unittest import override_config +from tests.utils import MockClock + + +class MediaRetentionTestCase(unittest.HomeserverTestCase): + + ONE_DAY_IN_MS = 24 * 60 * 60 * 1000 + THIRTY_DAYS_IN_MS = 30 * ONE_DAY_IN_MS + + servlets = [ + room.register_servlets, + login.register_servlets, + register.register_servlets, + admin.register_servlets_for_client_rest_resource, + ] + + def make_homeserver(self, reactor: MemoryReactor, clock: Clock) -> HomeServer: + # We need to be able to test advancing time in the homeserver, so we + # replace the test homeserver's default clock with a MockClock, which + # supports advancing time. + return self.setup_test_homeserver(clock=MockClock()) + + def prepare(self, reactor: MemoryReactor, clock: Clock, hs: HomeServer) -> None: + self.remote_server_name = "remote.homeserver" + self.store = hs.get_datastores().main + + # Create a user to upload media with + test_user_id = self.register_user("alice", "password") + + # Inject media (3 images each; recently accessed, old access, never accessed) + # into both the local store and the remote cache + media_repository = hs.get_media_repository() + test_media_content = b"example string" + + def _create_media_and_set_last_accessed( + last_accessed_ms: Optional[int], + ) -> str: + # "Upload" some media to the local media store + mxc_uri = self.get_success( + media_repository.create_content( + media_type="text/plain", + upload_name=None, + content=io.BytesIO(test_media_content), + content_length=len(test_media_content), + auth_user=UserID.from_string(test_user_id), + ) + ) + + media_id = mxc_uri.split("/")[-1] + + # Set the last recently accessed time for 
this media + if last_accessed_ms is not None: + self.get_success( + self.store.update_cached_last_access_time( + local_media=(media_id,), + remote_media=(), + time_ms=last_accessed_ms, + ) + ) + + return media_id + + def _cache_remote_media_and_set_last_accessed( + media_id: str, last_accessed_ms: Optional[int] + ) -> str: + # Pretend to cache some remote media + self.get_success( + self.store.store_cached_remote_media( + origin=self.remote_server_name, + media_id=media_id, + media_type="text/plain", + media_length=1, + time_now_ms=clock.time_msec(), + upload_name="testfile.txt", + filesystem_id="abcdefg12345", + ) + ) + + # Set the last recently accessed time for this media + if last_accessed_ms is not None: + self.get_success( + hs.get_datastores().main.update_cached_last_access_time( + local_media=(), + remote_media=((self.remote_server_name, media_id),), + time_ms=last_accessed_ms, + ) + ) + + return media_id + + # Start with the local media store + self.local_recently_accessed_media = _create_media_and_set_last_accessed( + self.THIRTY_DAYS_IN_MS + ) + self.local_not_recently_accessed_media = _create_media_and_set_last_accessed( + self.ONE_DAY_IN_MS + ) + self.local_never_accessed_media = _create_media_and_set_last_accessed(None) + + # And now the remote media store + self.remote_recently_accessed_media = _cache_remote_media_and_set_last_accessed( + "a", self.THIRTY_DAYS_IN_MS + ) + self.remote_not_recently_accessed_media = ( + _cache_remote_media_and_set_last_accessed("b", self.ONE_DAY_IN_MS) + ) + # Remote media will always have a "last accessed" attribute, as it would not + # be fetched from the remote homeserver unless instigated by a user. 
+ + @override_config( + { + "media_retention": { + # Enable retention for local media + "local_media_lifetime": "30d" + # Cached remote media should not be purged + } + } + ) + def test_local_media_retention(self) -> None: + """ + Tests that local media that have not been accessed recently is purged, while + cached remote media is unaffected. + """ + # Advance 31 days (in seconds) + self.reactor.advance(31 * 24 * 60 * 60) + + # Check that media has been correctly purged. + # Local media accessed <30 days ago should still exist. + # Remote media should be unaffected. + self._assert_if_mxc_uris_purged( + purged=[ + ( + self.hs.config.server.server_name, + self.local_not_recently_accessed_media, + ), + (self.hs.config.server.server_name, self.local_never_accessed_media), + ], + not_purged=[ + (self.hs.config.server.server_name, self.local_recently_accessed_media), + (self.remote_server_name, self.remote_recently_accessed_media), + (self.remote_server_name, self.remote_not_recently_accessed_media), + ], + ) + + @override_config( + { + "media_retention": { + # Enable retention for cached remote media + "remote_media_lifetime": "30d" + # Local media should not be purged + } + } + ) + def test_remote_media_cache_retention(self) -> None: + """ + Tests that entries from the remote media cache that have not been accessed + recently is purged, while local media is unaffected. + """ + # Advance 31 days (in seconds) + self.reactor.advance(31 * 24 * 60 * 60) + + # Check that media has been correctly purged. + # Local media should be unaffected. + # Remote media accessed <30 days ago should still exist. 
+ self._assert_if_mxc_uris_purged( + purged=[ + (self.remote_server_name, self.remote_not_recently_accessed_media), + ], + not_purged=[ + (self.remote_server_name, self.remote_recently_accessed_media), + (self.hs.config.server.server_name, self.local_recently_accessed_media), + ( + self.hs.config.server.server_name, + self.local_not_recently_accessed_media, + ), + (self.hs.config.server.server_name, self.local_never_accessed_media), + ], + ) + + def _assert_if_mxc_uris_purged( + self, purged: Iterable[Tuple[str, str]], not_purged: Iterable[Tuple[str, str]] + ) -> None: + def _assert_mxc_uri_purge_state( + server_name: str, media_id: str, expect_purged: bool + ) -> None: + """Given an MXC URI, assert whether it has been purged or not.""" + if server_name == self.hs.config.server.server_name: + found_media_dict = self.get_success( + self.store.get_local_media(media_id) + ) + else: + found_media_dict = self.get_success( + self.store.get_cached_remote_media(server_name, media_id) + ) + + mxc_uri = f"mxc://{server_name}/{media_id}" + + if expect_purged: + self.assertIsNone( + found_media_dict, msg=f"{mxc_uri} unexpectedly not purged" + ) + else: + self.assertIsNotNone( + found_media_dict, + msg=f"{mxc_uri} unexpectedly purged", + ) + + # Assert that the given MXC URIs have either been correctly purged or not. 
+ for server_name, media_id in purged: + _assert_mxc_uri_purge_state(server_name, media_id, expect_purged=True) + for server_name, media_id in not_purged: + _assert_mxc_uri_purge_state(server_name, media_id, expect_purged=False) From 66af2971cde758f68b219ae347177f6732796e40 Mon Sep 17 00:00:00 2001 From: Andrew Morgan Date: Thu, 26 May 2022 16:27:45 +0100 Subject: [PATCH 11/14] Changelog --- changelog.d/12732.feature | 1 + 1 file changed, 1 insertion(+) create mode 100644 changelog.d/12732.feature diff --git a/changelog.d/12732.feature b/changelog.d/12732.feature new file mode 100644 index 000000000000..3c73363d28d2 --- /dev/null +++ b/changelog.d/12732.feature @@ -0,0 +1 @@ +Add new `media_retention` options to the homeserver config for routinely cleaning up non-recently accessed media. \ No newline at end of file From 04cae77cf9f023ad6edb34247e39f8e8b94012aa Mon Sep 17 00:00:00 2001 From: Andrew Morgan Date: Thu, 26 May 2022 16:30:11 +0100 Subject: [PATCH 12/14] Remove empty Raises block in docstring --- synapse/rest/media/v1/media_repository.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/synapse/rest/media/v1/media_repository.py b/synapse/rest/media/v1/media_repository.py index eec6a5494c0a..1efe50f14cc7 100644 --- a/synapse/rest/media/v1/media_repository.py +++ b/synapse/rest/media/v1/media_repository.py @@ -866,9 +866,6 @@ async def _apply_media_retention_rules(self) -> None: """ Purge old local and remote media according to the media retention rules defined in the homeserver config. - - Raises: - ... 
""" # Purge remote media if self._media_retention_remote_media_lifetime_ms is not None: From d7a3040f95016f404966cbc147ba99e715d009d2 Mon Sep 17 00:00:00 2001 From: Andrew Morgan Date: Tue, 31 May 2022 17:04:35 +0100 Subject: [PATCH 13/14] Remove purge_period option --- docs/usage/configuration/config_documentation.md | 8 -------- synapse/config/repository.py | 4 ---- synapse/rest/media/v1/media_repository.py | 5 ++++- 3 files changed, 4 insertions(+), 13 deletions(-) diff --git a/docs/usage/configuration/config_documentation.md b/docs/usage/configuration/config_documentation.md index 4ac6989d9ead..c6167740e83e 100644 --- a/docs/usage/configuration/config_documentation.md +++ b/docs/usage/configuration/config_documentation.md @@ -1522,13 +1522,6 @@ Purging media files will be carried out by the media worker (that is, the worker that has the `enable_media_repo` homeserver config option set to 'true'). This may be the main process. -The `media_retention.purge_period` option dictates how often Synapse should -scan and purge media to be removed according to the configured thresholds. -For example, if set to "6h", Synapse will check every 6 hours for media -that can be purged. The default value is "24h" meaning 24 hours. Synapse -will not regularly check for media to purge if no other media retention -options are set. - The `media_retention.local_media_lifetime` and `media_retention.remote_media_lifetime` config options control whether media will be purged if it has not been accessed in a given amount of @@ -1541,7 +1534,6 @@ then media of that type will not be purged. 
Example configuration: ```yaml media_retention: - purge_period: 7d local_media_lifetime: 90d remote_media_lifetime: 14d ``` diff --git a/synapse/config/repository.py b/synapse/config/repository.py index 080e9f4bd48d..f9c55143c39d 100644 --- a/synapse/config/repository.py +++ b/synapse/config/repository.py @@ -225,10 +225,6 @@ def read_config(self, config: JsonDict, **kwargs: Any) -> None: media_retention = config.get("media_retention") or {} - self.media_retention_purge_period = self.parse_duration( - media_retention.get("purge_period", "24h") - ) - self.media_retention_local_media_lifetime_ms = None local_media_lifetime = media_retention.get("local_media_lifetime") if local_media_lifetime is not None: diff --git a/synapse/rest/media/v1/media_repository.py b/synapse/rest/media/v1/media_repository.py index 1efe50f14cc7..20af36653811 100644 --- a/synapse/rest/media/v1/media_repository.py +++ b/synapse/rest/media/v1/media_repository.py @@ -68,6 +68,9 @@ # How often to run the background job to update the "recently accessed" # attribute of local and remote media. UPDATE_RECENTLY_ACCESSED_TS = 60 * 1000 # 1 minute +# How often to run the background job to check for local and remote media +# that should be purged according to the configured media retention settings. +MEDIA_RETENTION_CHECK_PERIOD_MS = 60 * 60 * 1000 # 1 hour class MediaRepository: @@ -141,7 +144,7 @@ def __init__(self, hs: "HomeServer"): # with the duration between runs dictated by the homeserver config. 
self.clock.looping_call( self._start_apply_media_retention_rules, - hs.config.media.media_retention_purge_period, + MEDIA_RETENTION_CHECK_PERIOD_MS, ) def _start_update_recently_accessed(self) -> Deferred: From 9788cc7ed96dee1bc0b8737e7a788fe037fe531d Mon Sep 17 00:00:00 2001 From: Andrew Morgan Date: Tue, 31 May 2022 17:04:43 +0100 Subject: [PATCH 14/14] Remember that it's 2022 --- tests/rest/media/test_media_retention.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/rest/media/test_media_retention.py b/tests/rest/media/test_media_retention.py index ca3d4216dcbd..b98a5cd586f0 100644 --- a/tests/rest/media/test_media_retention.py +++ b/tests/rest/media/test_media_retention.py @@ -1,4 +1,4 @@ -# Copyright 2021 The Matrix.org Foundation C.I.C. +# Copyright 2022 The Matrix.org Foundation C.I.C. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License.