
Reload cache factors from disk on SIGHUP (#12673)

David Robertson, 1 year ago
commit d38d242411

+ 1 - 0
changelog.d/12673.feature

@@ -0,0 +1 @@
+Synapse will now reload [cache config](https://matrix-org.github.io/synapse/latest/usage/configuration/config_documentation.html#caching) when it receives a [SIGHUP](https://en.wikipedia.org/wiki/SIGHUP) signal.

+ 6 - 0
docs/sample_config.yaml

@@ -730,6 +730,12 @@ retention:
 # A cache 'factor' is a multiplier that can be applied to each of
 # Synapse's caches in order to increase or decrease the maximum
 # number of entries that can be stored.
+#
+# The configuration for cache factors (caches.global_factor and
+# caches.per_cache_factors) can be reloaded while the application is running,
+# by sending a SIGHUP signal to the Synapse process. Changes to other parts of
+# the caching config will NOT be applied after a SIGHUP is received; a restart
+# is necessary.
 
 # The number of events to cache in memory. Not affected by
 # caches.global_factor.

+ 17 - 0
docs/usage/configuration/config_documentation.md

@@ -1130,6 +1130,23 @@ caches:
   expire_caches: false
   sync_response_cache_duration: 2m
 ```
+
+### Reloading cache factors
+
+The cache factors (i.e. `caches.global_factor` and `caches.per_cache_factors`) may be reloaded at any time by sending a
+[`SIGHUP`](https://en.wikipedia.org/wiki/SIGHUP) signal to Synapse, for example:
+
+```commandline
+kill -HUP [PID_OF_SYNAPSE_PROCESS]
+```
+
+If you are running multiple workers, you must update each worker's config file
+individually and send this signal to each worker process.
+
+If you're using the [example systemd service](https://github.com/matrix-org/synapse/blob/develop/contrib/systemd/matrix-synapse.service)
+file in Synapse's `contrib` directory, you can send a `SIGHUP` signal by using
+`systemctl reload matrix-synapse`.
+
 ---
 ## Database ##
 Config options related to database settings.
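
As a small aside to the documentation hunk above: the same `SIGHUP` can also be sent from Python, which can be handy in deployment tooling. A minimal sketch, assuming the homeserver's `pid_file` option points at a readable PID file (the helper name and path below are illustrative, not part of Synapse):

```python
import os
import signal


def reload_synapse_cache_factors(pid_file: str) -> None:
    """Send SIGHUP to the Synapse process whose PID is stored in `pid_file`."""
    with open(pid_file) as f:
        pid = int(f.read().strip())
    # SIGHUP triggers Synapse's registered handlers, including the cache reload.
    os.kill(pid, signal.SIGHUP)


# Example invocation (path is illustrative; use whatever pid_file your
# homeserver.yaml configures):
# reload_synapse_cache_factors("/path/to/homeserver.pid")
```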

+ 44 - 0
synapse/app/_base.py

@@ -49,9 +49,12 @@ from twisted.logger import LoggingFile, LogLevel
 from twisted.protocols.tls import TLSMemoryBIOFactory
 from twisted.python.threadpool import ThreadPool
 
+import synapse.util.caches
 from synapse.api.constants import MAX_PDU_SIZE
 from synapse.app import check_bind_error
 from synapse.app.phone_stats_home import start_phone_stats_home
+from synapse.config import ConfigError
+from synapse.config._base import format_config_error
 from synapse.config.homeserver import HomeServerConfig
 from synapse.config.server import ManholeConfig
 from synapse.crypto import context_factory
@@ -432,6 +435,10 @@ async def start(hs: "HomeServer") -> None:
         signal.signal(signal.SIGHUP, run_sighup)
 
         register_sighup(refresh_certificate, hs)
+        register_sighup(reload_cache_config, hs.config)
+
+    # Apply the cache config.
+    hs.config.caches.resize_all_caches()
 
     # Load the certificate from disk.
     refresh_certificate(hs)
@@ -486,6 +493,43 @@ async def start(hs: "HomeServer") -> None:
         atexit.register(gc.freeze)
 
 
+def reload_cache_config(config: HomeServerConfig) -> None:
+    """Reload cache config from disk and immediately apply it.resize caches accordingly.
+
+    If the config is invalid, a `ConfigError` is logged and no changes are made.
+
+    Otherwise, this:
+        - replaces the `caches` section on the given `config` object,
+        - updates `synapse.util.caches.TRACK_MEMORY_USAGE` from the new config, and
+        - resizes all caches according to the new cache factors.
+
+    Note that the following cache config keys are read, but not applied:
+        - event_cache_size: used to set a max_size and _original_max_size on
+              EventsWorkerStore._get_event_cache when it is created. We'd have to update
+              the _original_max_size (and maybe the max_size) of that cache.
+        - sync_response_cache_duration: would have to update the timeout_sec attribute on
+              HomeServer -> SyncHandler -> ResponseCache.
+        - track_memory_usage: this affects synapse.util.caches.TRACK_MEMORY_USAGE, which
+              influences Synapse's self-reported metrics.
+
+    Also, the HTTPConnectionPool in SimpleHTTPClient sets its maxPersistentPerHost
+    parameter based on the global_factor. This won't be applied on a config reload.
+    """
+    try:
+        previous_cache_config = config.reload_config_section("caches")
+    except ConfigError as e:
+        logger.warning("Failed to reload cache config")
+        for f in format_config_error(e):
+            logger.warning(f)
+    else:
+        logger.debug(
+            "New cache config. Was:\n %s\nNow:\n",
+            previous_cache_config.__dict__,
+            config.caches.__dict__,
+        )
+        synapse.util.caches.TRACK_MEMORY_USAGE = config.caches.track_memory_usage
+        config.caches.resize_all_caches()
+
+
 def setup_sentry(hs: "HomeServer") -> None:
     """Enable sentry integration, if enabled in configuration"""
 

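For orientation, `register_sighup` in `synapse/app/_base.py` collects callbacks together with their arguments, and the installed signal handler runs them all when the process receives `SIGHUP`; `reload_cache_config` is simply one more such callback. A simplified, standalone illustration of that pattern (POSIX-only; this is not Synapse's actual implementation):

```python
import signal
from types import FrameType
from typing import Any, Callable, List, Optional, Tuple

# Callbacks to run on SIGHUP, each stored with the arguments to call it with.
_sighup_callbacks: List[Tuple[Callable[..., None], Tuple[Any, ...]]] = []


def register_sighup(func: Callable[..., None], *args: Any) -> None:
    """Remember a callback to run (with `args`) when SIGHUP arrives."""
    _sighup_callbacks.append((func, args))


def _run_sighup(signum: int, frame: Optional[FrameType]) -> None:
    # Run every registered callback in registration order.
    for func, args in _sighup_callbacks:
        func(*args)


signal.signal(signal.SIGHUP, _run_sighup)
```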
+ 2 - 34
synapse/app/homeserver.py

@@ -16,7 +16,7 @@
 import logging
 import os
 import sys
-from typing import Dict, Iterable, Iterator, List
+from typing import Dict, Iterable, List
 
 from matrix_common.versionstring import get_distribution_version_string
 
@@ -45,7 +45,7 @@ from synapse.app._base import (
     redirect_stdio_to_logs,
     register_start,
 )
-from synapse.config._base import ConfigError
+from synapse.config._base import ConfigError, format_config_error
 from synapse.config.emailconfig import ThreepidBehaviour
 from synapse.config.homeserver import HomeServerConfig
 from synapse.config.server import ListenerConfig
@@ -399,38 +399,6 @@ def setup(config_options: List[str]) -> SynapseHomeServer:
     return hs
 
 
-def format_config_error(e: ConfigError) -> Iterator[str]:
-    """
-    Formats a config error neatly
-
-    The idea is to format the immediate error, plus the "causes" of those errors,
-    hopefully in a way that makes sense to the user. For example:
-
-        Error in configuration at 'oidc_config.user_mapping_provider.config.display_name_template':
-          Failed to parse config for module 'JinjaOidcMappingProvider':
-            invalid jinja template:
-              unexpected end of template, expected 'end of print statement'.
-
-    Args:
-        e: the error to be formatted
-
-    Returns: An iterator which yields string fragments to be formatted
-    """
-    yield "Error in configuration"
-
-    if e.path:
-        yield " at '%s'" % (".".join(e.path),)
-
-    yield ":\n  %s" % (e.msg,)
-
-    parent_e = e.__cause__
-    indent = 1
-    while parent_e:
-        indent += 1
-        yield ":\n%s%s" % ("  " * indent, str(parent_e))
-        parent_e = parent_e.__cause__
-
-
 def run(hs: HomeServer) -> None:
     _base.start_reactor(
         "synapse-homeserver",

+ 74 - 7
synapse/config/_base.py

@@ -16,14 +16,18 @@
 
 import argparse
 import errno
+import logging
 import os
 from collections import OrderedDict
 from hashlib import sha256
 from textwrap import dedent
 from typing import (
     Any,
+    ClassVar,
+    Collection,
     Dict,
     Iterable,
+    Iterator,
     List,
     MutableMapping,
     Optional,
@@ -40,6 +44,8 @@ import yaml
 
 from synapse.util.templates import _create_mxc_to_http_filter, _format_ts_filter
 
+logger = logging.getLogger(__name__)
+
 
 class ConfigError(Exception):
     """Represents a problem parsing the configuration
@@ -55,6 +61,38 @@ class ConfigError(Exception):
         self.path = path
 
 
+def format_config_error(e: ConfigError) -> Iterator[str]:
+    """
+    Formats a config error neatly
+
+    The idea is to format the immediate error, plus the "causes" of those errors,
+    hopefully in a way that makes sense to the user. For example:
+
+        Error in configuration at 'oidc_config.user_mapping_provider.config.display_name_template':
+          Failed to parse config for module 'JinjaOidcMappingProvider':
+            invalid jinja template:
+              unexpected end of template, expected 'end of print statement'.
+
+    Args:
+        e: the error to be formatted
+
+    Returns: An iterator which yields string fragments to be formatted
+    """
+    yield "Error in configuration"
+
+    if e.path:
+        yield " at '%s'" % (".".join(e.path),)
+
+    yield ":\n  %s" % (e.msg,)
+
+    parent_e = e.__cause__
+    indent = 1
+    while parent_e:
+        indent += 1
+        yield ":\n%s%s" % ("  " * indent, str(parent_e))
+        parent_e = parent_e.__cause__
+
+
 # We split these messages out to allow packages to override with package
 # specific instructions.
 MISSING_REPORT_STATS_CONFIG_INSTRUCTIONS = """\
@@ -119,7 +157,7 @@ class Config:
             defined in subclasses.
     """
 
-    section: str
+    section: ClassVar[str]
 
     def __init__(self, root_config: "RootConfig" = None):
         self.root = root_config
@@ -309,9 +347,12 @@ class RootConfig:
     class, lower-cased and with "Config" removed.
     """
 
-    config_classes = []
+    config_classes: List[Type[Config]] = []
+
+    def __init__(self, config_files: Collection[str] = ()):
+        # Capture absolute paths here, so we can reload config after we daemonize.
+        self.config_files = [os.path.abspath(path) for path in config_files]
 
-    def __init__(self):
         for config_class in self.config_classes:
             if config_class.section is None:
                 raise ValueError("%r requires a section name" % (config_class,))
@@ -512,12 +553,10 @@ class RootConfig:
             object from parser.parse_args(..)`
         """
 
-        obj = cls()
-
         config_args = parser.parse_args(argv)
 
         config_files = find_config_files(search_paths=config_args.config_path)
-
+        obj = cls(config_files)
         if not config_files:
             parser.error("Must supply a config file.")
 
@@ -627,7 +666,7 @@ class RootConfig:
 
         generate_missing_configs = config_args.generate_missing_configs
 
-        obj = cls()
+        obj = cls(config_files)
 
         if config_args.generate_config:
             if config_args.report_stats is None:
@@ -727,6 +766,34 @@ class RootConfig:
     ) -> None:
         self.invoke_all("generate_files", config_dict, config_dir_path)
 
+    def reload_config_section(self, section_name: str) -> Config:
+        """Reconstruct the given config section, leaving all others unchanged.
+
+        This works in three steps:
+
+        1. Create a new instance of the relevant `Config` subclass.
+        2. Re-read the config files from disk and call `read_config` on that
+           instance to parse them.
+        3. Replace the existing config instance with the new one.
+
+        :raises ValueError: if the given `section_name` does not exist.
+        :raises ConfigError: for any other problems reloading config.
+
+        :returns: the previous config object, which no longer has a reference to this
+            RootConfig.
+        """
+        existing_config: Optional[Config] = getattr(self, section_name, None)
+        if existing_config is None:
+            raise ValueError(f"Unknown config section '{section_name}'")
+        logger.info("Reloading config section '%s'", section_name)
+
+        new_config_data = read_config_files(self.config_files)
+        new_config = type(existing_config)(self)
+        new_config.read_config(new_config_data)
+        setattr(self, section_name, new_config)
+
+        existing_config.root = None
+        return existing_config
+
 
 def read_config_files(config_files: Iterable[str]) -> Dict[str, Any]:
     """Read the config files into a dict

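A hedged usage sketch of the new `RootConfig.reload_config_section` API: only the method and attribute names come from the diff above, while the helper function, prints, and variable names are illustrative.

```python
from synapse.config._base import ConfigError, format_config_error
from synapse.config.homeserver import HomeServerConfig


def reload_caches(config: HomeServerConfig) -> None:
    try:
        # Re-reads config.config_files from disk and rebuilds only the caches section.
        old_caches = config.reload_config_section("caches")
    except ConfigError as e:
        # format_config_error yields fragments; join them into one message.
        print("Failed to reload cache config: " + "".join(format_config_error(e)))
        return
    print(f"global_factor: {old_caches.global_factor} -> {config.caches.global_factor}")
    config.caches.resize_all_caches()  # apply the new factors to registered caches
```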
+ 14 - 1
synapse/config/_base.pyi

@@ -1,15 +1,19 @@
 import argparse
 from typing import (
     Any,
+    Collection,
     Dict,
     Iterable,
+    Iterator,
     List,
+    Literal,
     MutableMapping,
     Optional,
     Tuple,
     Type,
     TypeVar,
     Union,
+    overload,
 )
 
 import jinja2
@@ -64,6 +68,8 @@ class ConfigError(Exception):
         self.msg = msg
         self.path = path
 
+def format_config_error(e: ConfigError) -> Iterator[str]: ...
+
 MISSING_REPORT_STATS_CONFIG_INSTRUCTIONS: str
 MISSING_REPORT_STATS_SPIEL: str
 MISSING_SERVER_NAME: str
@@ -117,7 +123,8 @@ class RootConfig:
     background_updates: background_updates.BackgroundUpdateConfig
 
     config_classes: List[Type["Config"]] = ...
-    def __init__(self) -> None: ...
+    config_files: List[str]
+    def __init__(self, config_files: Collection[str] = ...) -> None: ...
     def invoke_all(
         self, func_name: str, *args: Any, **kwargs: Any
     ) -> MutableMapping[str, Any]: ...
@@ -157,6 +164,12 @@ class RootConfig:
     def generate_missing_files(
         self, config_dict: dict, config_dir_path: str
     ) -> None: ...
+    @overload
+    def reload_config_section(
+        self, section_name: Literal["caches"]
+    ) -> cache.CacheConfig: ...
+    @overload
+    def reload_config_section(self, section_name: str) -> Config: ...
 
 class Config:
     root: RootConfig

+ 31 - 18
synapse/config/cache.py

@@ -69,11 +69,11 @@ def _canonicalise_cache_name(cache_name: str) -> str:
 def add_resizable_cache(
     cache_name: str, cache_resize_callback: Callable[[float], None]
 ) -> None:
-    """Register a cache that's size can dynamically change
+    """Register a cache whose size can dynamically change
 
     Args:
         cache_name: A reference to the cache
-        cache_resize_callback: A callback function that will be ran whenever
+        cache_resize_callback: A callback function that will run whenever
             the cache needs to be resized
     """
     # Some caches have '*' in them which we strip out.
@@ -96,6 +96,13 @@ class CacheConfig(Config):
     section = "caches"
     _environ = os.environ
 
+    event_cache_size: int
+    cache_factors: Dict[str, float]
+    global_factor: float
+    track_memory_usage: bool
+    expiry_time_msec: Optional[int]
+    sync_response_cache_duration: int
+
     @staticmethod
     def reset() -> None:
         """Resets the caches to their defaults. Used for tests."""
@@ -115,6 +122,12 @@ class CacheConfig(Config):
         # A cache 'factor' is a multiplier that can be applied to each of
         # Synapse's caches in order to increase or decrease the maximum
         # number of entries that can be stored.
+        #
+        # The configuration for cache factors (caches.global_factor and
+        # caches.per_cache_factors) can be reloaded while the application is running,
+        # by sending a SIGHUP signal to the Synapse process. Changes to other parts of
+        # the caching config will NOT be applied after a SIGHUP is received; a restart
+        # is necessary.
 
         # The number of events to cache in memory. Not affected by
         # caches.global_factor.
@@ -174,21 +187,21 @@ class CacheConfig(Config):
         """
 
     def read_config(self, config: JsonDict, **kwargs: Any) -> None:
+        """Populate this config object with values from `config`.
+
+        This method does NOT resize existing or future caches: use `resize_all_caches`.
+        We use two separate methods so that we can reject bad config before applying it.
+        """
         self.event_cache_size = self.parse_size(
             config.get("event_cache_size", _DEFAULT_EVENT_CACHE_SIZE)
         )
-        self.cache_factors: Dict[str, float] = {}
+        self.cache_factors = {}
 
         cache_config = config.get("caches") or {}
-        self.global_factor = cache_config.get(
-            "global_factor", properties.default_factor_size
-        )
+        self.global_factor = cache_config.get("global_factor", _DEFAULT_FACTOR_SIZE)
         if not isinstance(self.global_factor, (int, float)):
             raise ConfigError("caches.global_factor must be a number.")
 
-        # Set the global one so that it's reflected in new caches
-        properties.default_factor_size = self.global_factor
-
         # Load cache factors from the config
         individual_factors = cache_config.get("per_cache_factors") or {}
         if not isinstance(individual_factors, dict):
@@ -230,7 +243,7 @@ class CacheConfig(Config):
         cache_entry_ttl = cache_config.get("cache_entry_ttl", "30m")
 
         if expire_caches:
-            self.expiry_time_msec: Optional[int] = self.parse_duration(cache_entry_ttl)
+            self.expiry_time_msec = self.parse_duration(cache_entry_ttl)
         else:
             self.expiry_time_msec = None
 
@@ -254,19 +267,19 @@ class CacheConfig(Config):
             cache_config.get("sync_response_cache_duration", 0)
         )
 
-        # Resize all caches (if necessary) with the new factors we've loaded
-        self.resize_all_caches()
-
-        # Store this function so that it can be called from other classes without
-        # needing an instance of Config
-        properties.resize_all_caches_func = self.resize_all_caches
-
     def resize_all_caches(self) -> None:
-        """Ensure all cache sizes are up to date
+        """Ensure all cache sizes are up-to-date.
 
         For each cache, run the mapped callback function with either
         a specific cache factor or the default, global one.
         """
+        # Set the global factor size, so that new caches are appropriately sized.
+        properties.default_factor_size = self.global_factor
+
+        # Store this function so that it can be called from other classes without
+        # needing an instance of CacheConfig
+        properties.resize_all_caches_func = self.resize_all_caches
+
         # block other threads from modifying _CACHES while we iterate it.
         with _CACHES_LOCK:
             for cache_name, callback in _CACHES.items():

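To make the `add_resizable_cache`/`resize_all_caches` contract concrete, here is a standalone sketch with simplified stand-in types (these are not Synapse's real cache classes): each cache registers a callback taking a factor, and resizing just replays the configured factors through those callbacks.

```python
from threading import Lock
from typing import Callable, Dict

# Registry of resize callbacks, keyed by cache name.
_CACHES: Dict[str, Callable[[float], None]] = {}
_CACHES_LOCK = Lock()


def add_resizable_cache(name: str, cache_resize_callback: Callable[[float], None]) -> None:
    with _CACHES_LOCK:
        _CACHES[name] = cache_resize_callback


class TinyCache:
    """Stand-in for LruCache: rescales its max size when given a new factor."""

    def __init__(self, base_size: int) -> None:
        self.base_size = base_size
        self.max_size = base_size

    def set_cache_factor(self, factor: float) -> None:
        self.max_size = int(self.base_size * factor)


def resize_all_caches(global_factor: float, per_cache_factors: Dict[str, float]) -> None:
    # Each cache gets its specific factor if configured, otherwise the global one.
    with _CACHES_LOCK:
        for name, callback in _CACHES.items():
            callback(per_cache_factors.get(name, global_factor))


cache = TinyCache(100)
add_resizable_cache("foo", cache.set_cache_factor)
resize_all_caches(2.0, {})            # cache.max_size == 200
resize_all_caches(2.0, {"foo": 0.5})  # cache.max_size == 50
```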
+ 1 - 1
synapse/http/client.py

@@ -348,7 +348,7 @@ class SimpleHttpClient:
         # XXX: The justification for using the cache factor here is that larger instances
         # will need both more cache and more connections.
         # Still, this should probably be a separate dial
-        pool.maxPersistentPerHost = max((100 * hs.config.caches.global_factor, 5))
+        pool.maxPersistentPerHost = max(int(100 * hs.config.caches.global_factor), 5)
         pool.cachedConnectionTimeout = 2 * 60
 
         self.agent: IAgent = ProxyAgent(

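A quick note on the `maxPersistentPerHost` change: both the old and new expressions keep the floor of 5 connections, but the old one could hand Twisted's pool a float when `global_factor` is fractional, whereas the new one always produces an int. Illustrative values:

```python
# Both forms apply the floor of 5, but only the new one guarantees an int.
global_factor = 0.5
old_value = max((100 * global_factor, 5))     # 50.0 -- max over a tuple; a float leaks through
new_value = max(int(100 * global_factor), 5)  # 50   -- always an int
print(old_value, new_value)
```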
+ 8 - 0
tests/config/test_cache.py

@@ -38,6 +38,7 @@ class CacheConfigTests(TestCase):
             "SYNAPSE_NOT_CACHE": "BLAH",
         }
         self.config.read_config(config, config_dir_path="", data_dir_path="")
+        self.config.resize_all_caches()
 
         self.assertEqual(dict(self.config.cache_factors), {"something_or_other": 2.0})
 
@@ -52,6 +53,7 @@ class CacheConfigTests(TestCase):
             "SYNAPSE_CACHE_FACTOR_FOO": 1,
         }
         self.config.read_config(config, config_dir_path="", data_dir_path="")
+        self.config.resize_all_caches()
 
         self.assertEqual(
             dict(self.config.cache_factors),
@@ -71,6 +73,7 @@ class CacheConfigTests(TestCase):
 
         config = {"caches": {"per_cache_factors": {"foo": 3}}}
         self.config.read_config(config)
+        self.config.resize_all_caches()
 
         self.assertEqual(cache.max_size, 300)
 
@@ -82,6 +85,7 @@ class CacheConfigTests(TestCase):
         """
         config = {"caches": {"per_cache_factors": {"foo": 2}}}
         self.config.read_config(config, config_dir_path="", data_dir_path="")
+        self.config.resize_all_caches()
 
         cache = LruCache(100)
         add_resizable_cache("foo", cache_resize_callback=cache.set_cache_factor)
@@ -99,6 +103,7 @@ class CacheConfigTests(TestCase):
 
         config = {"caches": {"global_factor": 4}}
         self.config.read_config(config, config_dir_path="", data_dir_path="")
+        self.config.resize_all_caches()
 
         self.assertEqual(cache.max_size, 400)
 
@@ -110,6 +115,7 @@ class CacheConfigTests(TestCase):
         """
         config = {"caches": {"global_factor": 1.5}}
         self.config.read_config(config, config_dir_path="", data_dir_path="")
+        self.config.resize_all_caches()
 
         cache = LruCache(100)
         add_resizable_cache("foo", cache_resize_callback=cache.set_cache_factor)
@@ -128,6 +134,7 @@ class CacheConfigTests(TestCase):
             "SYNAPSE_CACHE_FACTOR_CACHE_B": 3,
         }
         self.config.read_config(config, config_dir_path="", data_dir_path="")
+        self.config.resize_all_caches()
 
         cache_a = LruCache(100)
         add_resizable_cache("*cache_a*", cache_resize_callback=cache_a.set_cache_factor)
@@ -148,6 +155,7 @@ class CacheConfigTests(TestCase):
 
         config = {"caches": {"event_cache_size": "10k"}}
         self.config.read_config(config, config_dir_path="", data_dir_path="")
+        self.config.resize_all_caches()
 
         cache = LruCache(
             max_size=self.config.event_cache_size,

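The repeated `resize_all_caches()` calls above reflect the new split between parsing and applying cache config. A hedged sketch of an additional check in the same spirit (not part of this commit; equivalent to another method on `CacheConfigTests`): parsing alone should leave an existing cache untouched, and `resize_all_caches` should then apply the per-cache factor.

```python
from synapse.config.cache import CacheConfig, add_resizable_cache
from synapse.util.caches.lrucache import LruCache


def check_resize_is_deferred() -> None:
    config = CacheConfig()
    cache: LruCache = LruCache(100)
    add_resizable_cache("foo", cache_resize_callback=cache.set_cache_factor)
    size_before = cache.max_size

    config.read_config({"caches": {"per_cache_factors": {"foo": 3}}})
    # After this commit, read_config only parses; nothing is resized yet.
    assert cache.max_size == size_before

    config.resize_all_caches()
    assert cache.max_size == 300
```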
+ 1 - 0
tests/server.py

@@ -749,6 +749,7 @@ def setup_test_homeserver(
     if config is None:
         config = default_config(name, parse=True)
 
+    config.caches.resize_all_caches()
     config.ldap_enabled = False
 
     if "clock" not in kwargs: