Parcourir la source

Move some replication processing out of generic_worker (#9796)

Co-authored-by: Richard van der Hoff <1389908+richvdh@users.noreply.github.com>
Erik Johnston il y a 3 ans
Parent
commit
00a6db9676

+ 1 - 0
changelog.d/9796.misc

@@ -0,0 +1 @@
+Move some replication processing out of `generic_worker`.

+ 2 - 468
synapse/app/generic_worker.py

@@ -13,12 +13,9 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-import contextlib
 import logging
 import sys
-from typing import Dict, Iterable, Optional, Set
-
-from typing_extensions import ContextManager
+from typing import Dict, Iterable, Optional
 
 from twisted.internet import address
 from twisted.web.resource import IResource
@@ -40,24 +37,13 @@ from synapse.config._base import ConfigError
 from synapse.config.homeserver import HomeServerConfig
 from synapse.config.logger import setup_logging
 from synapse.config.server import ListenerConfig
-from synapse.federation import send_queue
 from synapse.federation.transport.server import TransportLayerServer
-from synapse.handlers.presence import (
-    BasePresenceHandler,
-    PresenceState,
-    get_interested_parties,
-)
 from synapse.http.server import JsonResource, OptionsResource
 from synapse.http.servlet import RestServlet, parse_json_object_from_request
 from synapse.http.site import SynapseSite
 from synapse.logging.context import LoggingContext
 from synapse.metrics import METRICS_PREFIX, MetricsResource, RegistryProxy
-from synapse.metrics.background_process_metrics import run_as_background_process
 from synapse.replication.http import REPLICATION_PREFIX, ReplicationRestResource
-from synapse.replication.http.presence import (
-    ReplicationBumpPresenceActiveTime,
-    ReplicationPresenceSetState,
-)
 from synapse.replication.slave.storage._base import BaseSlavedStore
 from synapse.replication.slave.storage.account_data import SlavedAccountDataStore
 from synapse.replication.slave.storage.appservice import SlavedApplicationServiceStore
@@ -77,19 +63,6 @@ from synapse.replication.slave.storage.receipts import SlavedReceiptsStore
 from synapse.replication.slave.storage.registration import SlavedRegistrationStore
 from synapse.replication.slave.storage.room import RoomStore
 from synapse.replication.slave.storage.transactions import SlavedTransactionStore
-from synapse.replication.tcp.client import ReplicationDataHandler
-from synapse.replication.tcp.commands import ClearUserSyncsCommand
-from synapse.replication.tcp.streams import (
-    AccountDataStream,
-    DeviceListsStream,
-    GroupServerStream,
-    PresenceStream,
-    PushersStream,
-    PushRulesStream,
-    ReceiptsStream,
-    TagAccountDataStream,
-    ToDeviceStream,
-)
 from synapse.rest.admin import register_servlets_for_media_repo
 from synapse.rest.client.v1 import events, login, room
 from synapse.rest.client.v1.initial_sync import InitialSyncRestServlet
@@ -128,7 +101,7 @@ from synapse.rest.client.versions import VersionsRestServlet
 from synapse.rest.health import HealthResource
 from synapse.rest.key.v2 import KeyApiV2Resource
 from synapse.rest.synapse.client import build_synapse_client_resource_tree
-from synapse.server import HomeServer, cache_in_self
+from synapse.server import HomeServer
 from synapse.storage.databases.main.censor_events import CensorEventsStore
 from synapse.storage.databases.main.client_ips import ClientIpWorkerStore
 from synapse.storage.databases.main.e2e_room_keys import EndToEndRoomKeyStore
@@ -137,14 +110,11 @@ from synapse.storage.databases.main.metrics import ServerMetricsStore
 from synapse.storage.databases.main.monthly_active_users import (
     MonthlyActiveUsersWorkerStore,
 )
-from synapse.storage.databases.main.presence import UserPresenceState
 from synapse.storage.databases.main.search import SearchWorkerStore
 from synapse.storage.databases.main.stats import StatsStore
 from synapse.storage.databases.main.transactions import TransactionWorkerStore
 from synapse.storage.databases.main.ui_auth import UIAuthWorkerStore
 from synapse.storage.databases.main.user_directory import UserDirectoryStore
-from synapse.types import ReadReceipt
-from synapse.util.async_helpers import Linearizer
 from synapse.util.httpresourcetree import create_resource_tree
 from synapse.util.versionstring import get_version_string
 
@@ -264,214 +234,6 @@ class KeyUploadServlet(RestServlet):
             return 200, {"one_time_key_counts": result}
 
 
-class _NullContextManager(ContextManager[None]):
-    """A context manager which does nothing."""
-
-    def __exit__(self, exc_type, exc_val, exc_tb):
-        pass
-
-
-UPDATE_SYNCING_USERS_MS = 10 * 1000
-
-
-class GenericWorkerPresence(BasePresenceHandler):
-    def __init__(self, hs):
-        super().__init__(hs)
-        self.hs = hs
-        self.is_mine_id = hs.is_mine_id
-
-        self.presence_router = hs.get_presence_router()
-        self._presence_enabled = hs.config.use_presence
-
-        # The number of ongoing syncs on this process, by user id.
-        # Empty if _presence_enabled is false.
-        self._user_to_num_current_syncs = {}  # type: Dict[str, int]
-
-        self.notifier = hs.get_notifier()
-        self.instance_id = hs.get_instance_id()
-
-        # user_id -> last_sync_ms. Lists the users that have stopped syncing
-        # but we haven't notified the master of that yet
-        self.users_going_offline = {}
-
-        self._bump_active_client = ReplicationBumpPresenceActiveTime.make_client(hs)
-        self._set_state_client = ReplicationPresenceSetState.make_client(hs)
-
-        self._send_stop_syncing_loop = self.clock.looping_call(
-            self.send_stop_syncing, UPDATE_SYNCING_USERS_MS
-        )
-
-        self._busy_presence_enabled = hs.config.experimental.msc3026_enabled
-
-        hs.get_reactor().addSystemEventTrigger(
-            "before",
-            "shutdown",
-            run_as_background_process,
-            "generic_presence.on_shutdown",
-            self._on_shutdown,
-        )
-
-    def _on_shutdown(self):
-        if self._presence_enabled:
-            self.hs.get_tcp_replication().send_command(
-                ClearUserSyncsCommand(self.instance_id)
-            )
-
-    def send_user_sync(self, user_id, is_syncing, last_sync_ms):
-        if self._presence_enabled:
-            self.hs.get_tcp_replication().send_user_sync(
-                self.instance_id, user_id, is_syncing, last_sync_ms
-            )
-
-    def mark_as_coming_online(self, user_id):
-        """A user has started syncing. Send a UserSync to the master, unless they
-        had recently stopped syncing.
-
-        Args:
-            user_id (str)
-        """
-        going_offline = self.users_going_offline.pop(user_id, None)
-        if not going_offline:
-            # Safe to skip because we haven't yet told the master they were offline
-            self.send_user_sync(user_id, True, self.clock.time_msec())
-
-    def mark_as_going_offline(self, user_id):
-        """A user has stopped syncing. We wait before notifying the master as
-        its likely they'll come back soon. This allows us to avoid sending
-        a stopped syncing immediately followed by a started syncing notification
-        to the master
-
-        Args:
-            user_id (str)
-        """
-        self.users_going_offline[user_id] = self.clock.time_msec()
-
-    def send_stop_syncing(self):
-        """Check if there are any users who have stopped syncing a while ago
-        and haven't come back yet. If there are poke the master about them.
-        """
-        now = self.clock.time_msec()
-        for user_id, last_sync_ms in list(self.users_going_offline.items()):
-            if now - last_sync_ms > UPDATE_SYNCING_USERS_MS:
-                self.users_going_offline.pop(user_id, None)
-                self.send_user_sync(user_id, False, last_sync_ms)
-
-    async def user_syncing(
-        self, user_id: str, affect_presence: bool
-    ) -> ContextManager[None]:
-        """Record that a user is syncing.
-
-        Called by the sync and events servlets to record that a user has connected to
-        this worker and is waiting for some events.
-        """
-        if not affect_presence or not self._presence_enabled:
-            return _NullContextManager()
-
-        curr_sync = self._user_to_num_current_syncs.get(user_id, 0)
-        self._user_to_num_current_syncs[user_id] = curr_sync + 1
-
-        # If we went from no in flight sync to some, notify replication
-        if self._user_to_num_current_syncs[user_id] == 1:
-            self.mark_as_coming_online(user_id)
-
-        def _end():
-            # We check that the user_id is in user_to_num_current_syncs because
-            # user_to_num_current_syncs may have been cleared if we are
-            # shutting down.
-            if user_id in self._user_to_num_current_syncs:
-                self._user_to_num_current_syncs[user_id] -= 1
-
-                # If we went from one in flight sync to non, notify replication
-                if self._user_to_num_current_syncs[user_id] == 0:
-                    self.mark_as_going_offline(user_id)
-
-        @contextlib.contextmanager
-        def _user_syncing():
-            try:
-                yield
-            finally:
-                _end()
-
-        return _user_syncing()
-
-    async def notify_from_replication(self, states, stream_id):
-        parties = await get_interested_parties(self.store, self.presence_router, states)
-        room_ids_to_states, users_to_states = parties
-
-        self.notifier.on_new_event(
-            "presence_key",
-            stream_id,
-            rooms=room_ids_to_states.keys(),
-            users=users_to_states.keys(),
-        )
-
-    async def process_replication_rows(self, token, rows):
-        states = [
-            UserPresenceState(
-                row.user_id,
-                row.state,
-                row.last_active_ts,
-                row.last_federation_update_ts,
-                row.last_user_sync_ts,
-                row.status_msg,
-                row.currently_active,
-            )
-            for row in rows
-        ]
-
-        for state in states:
-            self.user_to_current_state[state.user_id] = state
-
-        stream_id = token
-        await self.notify_from_replication(states, stream_id)
-
-    def get_currently_syncing_users_for_replication(self) -> Iterable[str]:
-        return [
-            user_id
-            for user_id, count in self._user_to_num_current_syncs.items()
-            if count > 0
-        ]
-
-    async def set_state(self, target_user, state, ignore_status_msg=False):
-        """Set the presence state of the user."""
-        presence = state["presence"]
-
-        valid_presence = (
-            PresenceState.ONLINE,
-            PresenceState.UNAVAILABLE,
-            PresenceState.OFFLINE,
-            PresenceState.BUSY,
-        )
-
-        if presence not in valid_presence or (
-            presence == PresenceState.BUSY and not self._busy_presence_enabled
-        ):
-            raise SynapseError(400, "Invalid presence state")
-
-        user_id = target_user.to_string()
-
-        # If presence is disabled, no-op
-        if not self.hs.config.use_presence:
-            return
-
-        # Proxy request to master
-        await self._set_state_client(
-            user_id=user_id, state=state, ignore_status_msg=ignore_status_msg
-        )
-
-    async def bump_presence_active_time(self, user):
-        """We've seen the user do something that indicates they're interacting
-        with the app.
-        """
-        # If presence is disabled, no-op
-        if not self.hs.config.use_presence:
-            return
-
-        # Proxy request to master
-        user_id = user.to_string()
-        await self._bump_active_client(user_id=user_id)
-
-
 class GenericWorkerSlavedStore(
     # FIXME(#3714): We need to add UserDirectoryStore as we write directly
     # rather than going via the correct worker.
@@ -657,234 +419,6 @@ class GenericWorkerServer(HomeServer):
 
         self.get_tcp_replication().start_replication(self)
 
-    @cache_in_self
-    def get_replication_data_handler(self):
-        return GenericWorkerReplicationHandler(self)
-
-    @cache_in_self
-    def get_presence_handler(self):
-        return GenericWorkerPresence(self)
-
-
-class GenericWorkerReplicationHandler(ReplicationDataHandler):
-    def __init__(self, hs):
-        super().__init__(hs)
-
-        self.store = hs.get_datastore()
-        self.presence_handler = hs.get_presence_handler()  # type: GenericWorkerPresence
-        self.notifier = hs.get_notifier()
-
-        self.notify_pushers = hs.config.start_pushers
-        self.pusher_pool = hs.get_pusherpool()
-
-        self.send_handler = None  # type: Optional[FederationSenderHandler]
-        if hs.config.send_federation:
-            self.send_handler = FederationSenderHandler(hs)
-
-    async def on_rdata(self, stream_name, instance_name, token, rows):
-        await super().on_rdata(stream_name, instance_name, token, rows)
-        await self._process_and_notify(stream_name, instance_name, token, rows)
-
-    async def _process_and_notify(self, stream_name, instance_name, token, rows):
-        try:
-            if self.send_handler:
-                await self.send_handler.process_replication_rows(
-                    stream_name, token, rows
-                )
-
-            if stream_name == PushRulesStream.NAME:
-                self.notifier.on_new_event(
-                    "push_rules_key", token, users=[row.user_id for row in rows]
-                )
-            elif stream_name in (AccountDataStream.NAME, TagAccountDataStream.NAME):
-                self.notifier.on_new_event(
-                    "account_data_key", token, users=[row.user_id for row in rows]
-                )
-            elif stream_name == ReceiptsStream.NAME:
-                self.notifier.on_new_event(
-                    "receipt_key", token, rooms=[row.room_id for row in rows]
-                )
-                await self.pusher_pool.on_new_receipts(
-                    token, token, {row.room_id for row in rows}
-                )
-            elif stream_name == ToDeviceStream.NAME:
-                entities = [row.entity for row in rows if row.entity.startswith("@")]
-                if entities:
-                    self.notifier.on_new_event("to_device_key", token, users=entities)
-            elif stream_name == DeviceListsStream.NAME:
-                all_room_ids = set()  # type: Set[str]
-                for row in rows:
-                    if row.entity.startswith("@"):
-                        room_ids = await self.store.get_rooms_for_user(row.entity)
-                        all_room_ids.update(room_ids)
-                self.notifier.on_new_event("device_list_key", token, rooms=all_room_ids)
-            elif stream_name == PresenceStream.NAME:
-                await self.presence_handler.process_replication_rows(token, rows)
-            elif stream_name == GroupServerStream.NAME:
-                self.notifier.on_new_event(
-                    "groups_key", token, users=[row.user_id for row in rows]
-                )
-            elif stream_name == PushersStream.NAME:
-                for row in rows:
-                    if row.deleted:
-                        self.stop_pusher(row.user_id, row.app_id, row.pushkey)
-                    else:
-                        await self.start_pusher(row.user_id, row.app_id, row.pushkey)
-        except Exception:
-            logger.exception("Error processing replication")
-
-    async def on_position(self, stream_name: str, instance_name: str, token: int):
-        await super().on_position(stream_name, instance_name, token)
-        # Also call on_rdata to ensure that stream positions are properly reset.
-        await self.on_rdata(stream_name, instance_name, token, [])
-
-    def stop_pusher(self, user_id, app_id, pushkey):
-        if not self.notify_pushers:
-            return
-
-        key = "%s:%s" % (app_id, pushkey)
-        pushers_for_user = self.pusher_pool.pushers.get(user_id, {})
-        pusher = pushers_for_user.pop(key, None)
-        if pusher is None:
-            return
-        logger.info("Stopping pusher %r / %r", user_id, key)
-        pusher.on_stop()
-
-    async def start_pusher(self, user_id, app_id, pushkey):
-        if not self.notify_pushers:
-            return
-
-        key = "%s:%s" % (app_id, pushkey)
-        logger.info("Starting pusher %r / %r", user_id, key)
-        return await self.pusher_pool.start_pusher_by_id(app_id, pushkey, user_id)
-
-    def on_remote_server_up(self, server: str):
-        """Called when get a new REMOTE_SERVER_UP command."""
-
-        # Let's wake up the transaction queue for the server in case we have
-        # pending stuff to send to it.
-        if self.send_handler:
-            self.send_handler.wake_destination(server)
-
-
-class FederationSenderHandler:
-    """Processes the fedration replication stream
-
-    This class is only instantiate on the worker responsible for sending outbound
-    federation transactions. It receives rows from the replication stream and forwards
-    the appropriate entries to the FederationSender class.
-    """
-
-    def __init__(self, hs: GenericWorkerServer):
-        self.store = hs.get_datastore()
-        self._is_mine_id = hs.is_mine_id
-        self.federation_sender = hs.get_federation_sender()
-        self._hs = hs
-
-        # Stores the latest position in the federation stream we've gotten up
-        # to. This is always set before we use it.
-        self.federation_position = None
-
-        self._fed_position_linearizer = Linearizer(name="_fed_position_linearizer")
-
-    def wake_destination(self, server: str):
-        self.federation_sender.wake_destination(server)
-
-    async def process_replication_rows(self, stream_name, token, rows):
-        # The federation stream contains things that we want to send out, e.g.
-        # presence, typing, etc.
-        if stream_name == "federation":
-            send_queue.process_rows_for_federation(self.federation_sender, rows)
-            await self.update_token(token)
-
-        # ... and when new receipts happen
-        elif stream_name == ReceiptsStream.NAME:
-            await self._on_new_receipts(rows)
-
-        # ... as well as device updates and messages
-        elif stream_name == DeviceListsStream.NAME:
-            # The entities are either user IDs (starting with '@') whose devices
-            # have changed, or remote servers that we need to tell about
-            # changes.
-            hosts = {row.entity for row in rows if not row.entity.startswith("@")}
-            for host in hosts:
-                self.federation_sender.send_device_messages(host)
-
-        elif stream_name == ToDeviceStream.NAME:
-            # The to_device stream includes stuff to be pushed to both local
-            # clients and remote servers, so we ignore entities that start with
-            # '@' (since they'll be local users rather than destinations).
-            hosts = {row.entity for row in rows if not row.entity.startswith("@")}
-            for host in hosts:
-                self.federation_sender.send_device_messages(host)
-
-    async def _on_new_receipts(self, rows):
-        """
-        Args:
-            rows (Iterable[synapse.replication.tcp.streams.ReceiptsStream.ReceiptsStreamRow]):
-                new receipts to be processed
-        """
-        for receipt in rows:
-            # we only want to send on receipts for our own users
-            if not self._is_mine_id(receipt.user_id):
-                continue
-            receipt_info = ReadReceipt(
-                receipt.room_id,
-                receipt.receipt_type,
-                receipt.user_id,
-                [receipt.event_id],
-                receipt.data,
-            )
-            await self.federation_sender.send_read_receipt(receipt_info)
-
-    async def update_token(self, token):
-        """Update the record of where we have processed to in the federation stream.
-
-        Called after we have processed a an update received over replication. Sends
-        a FEDERATION_ACK back to the master, and stores the token that we have processed
-         in `federation_stream_position` so that we can restart where we left off.
-        """
-        self.federation_position = token
-
-        # We save and send the ACK to master asynchronously, so we don't block
-        # processing on persistence. We don't need to do this operation for
-        # every single RDATA we receive, we just need to do it periodically.
-
-        if self._fed_position_linearizer.is_queued(None):
-            # There is already a task queued up to save and send the token, so
-            # no need to queue up another task.
-            return
-
-        run_as_background_process("_save_and_send_ack", self._save_and_send_ack)
-
-    async def _save_and_send_ack(self):
-        """Save the current federation position in the database and send an ACK
-        to master with where we're up to.
-        """
-        try:
-            # We linearize here to ensure we don't have races updating the token
-            #
-            # XXX this appears to be redundant, since the ReplicationCommandHandler
-            # has a linearizer which ensures that we only process one line of
-            # replication data at a time. Should we remove it, or is it doing useful
-            # service for robustness? Or could we replace it with an assertion that
-            # we're not being re-entered?
-
-            with (await self._fed_position_linearizer.queue(None)):
-                # We persist and ack the same position, so we take a copy of it
-                # here as otherwise it can get modified from underneath us.
-                current_position = self.federation_position
-
-                await self.store.update_federation_out_pos(
-                    "federation", current_position
-                )
-
-                # We ACK this token over replication so that the master can drop
-                # its in memory queues
-                self._hs.get_tcp_replication().send_federation_ack(current_position)
-        except Exception:
-            logger.exception("Error updating federation stream position")
-
 
 def start(config_options):
     try:

+ 246 - 0
synapse/handlers/presence.py

@@ -22,6 +22,7 @@ The methods that define policy are:
     - should_notify
 """
 import abc
+import contextlib
 import logging
 from contextlib import contextmanager
 from typing import (
@@ -48,6 +49,11 @@ from synapse.logging.context import run_in_background
 from synapse.logging.utils import log_function
 from synapse.metrics import LaterGauge
 from synapse.metrics.background_process_metrics import run_as_background_process
+from synapse.replication.http.presence import (
+    ReplicationBumpPresenceActiveTime,
+    ReplicationPresenceSetState,
+)
+from synapse.replication.tcp.commands import ClearUserSyncsCommand
 from synapse.state import StateHandler
 from synapse.storage.databases.main import DataStore
 from synapse.types import Collection, JsonDict, UserID, get_domain_from_id
@@ -104,6 +110,10 @@ FEDERATION_PING_INTERVAL = 25 * 60 * 1000
 # are dead.
 EXTERNAL_PROCESS_EXPIRY = 5 * 60 * 1000
 
+# Delay before a worker tells the presence handler that a user has stopped
+# syncing.
+UPDATE_SYNCING_USERS_MS = 10 * 1000
+
 assert LAST_ACTIVE_GRANULARITY < IDLE_TIMER
 
 
@@ -208,6 +218,242 @@ class BasePresenceHandler(abc.ABC):
         with the app.
         """
 
+    async def update_external_syncs_row(
+        self, process_id, user_id, is_syncing, sync_time_msec
+    ):
+        """Update the syncing users for an external process as a delta.
+
+        This is a no-op when presence is handled by a different worker.
+
+        Args:
+            process_id (str): An identifier for the process the users are
+                syncing against. This allows synapse to process updates
+                as user start and stop syncing against a given process.
+            user_id (str): The user who has started or stopped syncing
+            is_syncing (bool): Whether or not the user is now syncing
+            sync_time_msec(int): Time in ms when the user was last syncing
+        """
+        pass
+
+    async def update_external_syncs_clear(self, process_id):
+        """Marks all users that had been marked as syncing by a given process
+        as offline.
+
+        Used when the process has stopped/disappeared.
+
+        This is a no-op when presence is handled by a different worker.
+        """
+        pass
+
+    async def process_replication_rows(self, token, rows):
+        """Process presence stream rows received over replication."""
+        pass
+
+
+class _NullContextManager(ContextManager[None]):
+    """A context manager which does nothing."""
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        pass
+
+
+class WorkerPresenceHandler(BasePresenceHandler):
+    def __init__(self, hs):
+        super().__init__(hs)
+        self.hs = hs
+        self.is_mine_id = hs.is_mine_id
+
+        self.presence_router = hs.get_presence_router()
+        self._presence_enabled = hs.config.use_presence
+
+        # The number of ongoing syncs on this process, by user id.
+        # Empty if _presence_enabled is false.
+        self._user_to_num_current_syncs = {}  # type: Dict[str, int]
+
+        self.notifier = hs.get_notifier()
+        self.instance_id = hs.get_instance_id()
+
+        # user_id -> last_sync_ms. Lists the users that have stopped syncing
+        # but we haven't notified the master of that yet
+        self.users_going_offline = {}
+
+        self._bump_active_client = ReplicationBumpPresenceActiveTime.make_client(hs)
+        self._set_state_client = ReplicationPresenceSetState.make_client(hs)
+
+        self._send_stop_syncing_loop = self.clock.looping_call(
+            self.send_stop_syncing, UPDATE_SYNCING_USERS_MS
+        )
+
+        self._busy_presence_enabled = hs.config.experimental.msc3026_enabled
+
+        hs.get_reactor().addSystemEventTrigger(
+            "before",
+            "shutdown",
+            run_as_background_process,
+            "generic_presence.on_shutdown",
+            self._on_shutdown,
+        )
+
+    def _on_shutdown(self):
+        if self._presence_enabled:
+            self.hs.get_tcp_replication().send_command(
+                ClearUserSyncsCommand(self.instance_id)
+            )
+
+    def send_user_sync(self, user_id, is_syncing, last_sync_ms):
+        if self._presence_enabled:
+            self.hs.get_tcp_replication().send_user_sync(
+                self.instance_id, user_id, is_syncing, last_sync_ms
+            )
+
+    def mark_as_coming_online(self, user_id):
+        """A user has started syncing. Send a UserSync to the master, unless they
+        had recently stopped syncing.
+
+        Args:
+            user_id (str)
+        """
+        going_offline = self.users_going_offline.pop(user_id, None)
+        if not going_offline:
+            # Safe to skip because we haven't yet told the master they were offline
+            self.send_user_sync(user_id, True, self.clock.time_msec())
+
+    def mark_as_going_offline(self, user_id):
+        """A user has stopped syncing. We wait before notifying the master as
+        its likely they'll come back soon. This allows us to avoid sending
+        a stopped syncing immediately followed by a started syncing notification
+        to the master
+
+        Args:
+            user_id (str)
+        """
+        self.users_going_offline[user_id] = self.clock.time_msec()
+
+    def send_stop_syncing(self):
+        """Check if there are any users who have stopped syncing a while ago
+        and haven't come back yet. If there are poke the master about them.
+        """
+        now = self.clock.time_msec()
+        for user_id, last_sync_ms in list(self.users_going_offline.items()):
+            if now - last_sync_ms > UPDATE_SYNCING_USERS_MS:
+                self.users_going_offline.pop(user_id, None)
+                self.send_user_sync(user_id, False, last_sync_ms)
+
+    async def user_syncing(
+        self, user_id: str, affect_presence: bool
+    ) -> ContextManager[None]:
+        """Record that a user is syncing.
+
+        Called by the sync and events servlets to record that a user has connected to
+        this worker and is waiting for some events.
+        """
+        if not affect_presence or not self._presence_enabled:
+            return _NullContextManager()
+
+        curr_sync = self._user_to_num_current_syncs.get(user_id, 0)
+        self._user_to_num_current_syncs[user_id] = curr_sync + 1
+
+        # If we went from no in flight sync to some, notify replication
+        if self._user_to_num_current_syncs[user_id] == 1:
+            self.mark_as_coming_online(user_id)
+
+        def _end():
+            # We check that the user_id is in user_to_num_current_syncs because
+            # user_to_num_current_syncs may have been cleared if we are
+            # shutting down.
+            if user_id in self._user_to_num_current_syncs:
+                self._user_to_num_current_syncs[user_id] -= 1
+
+                # If we went from one in flight sync to non, notify replication
+                if self._user_to_num_current_syncs[user_id] == 0:
+                    self.mark_as_going_offline(user_id)
+
+        @contextlib.contextmanager
+        def _user_syncing():
+            try:
+                yield
+            finally:
+                _end()
+
+        return _user_syncing()
+
+    async def notify_from_replication(self, states, stream_id):
+        parties = await get_interested_parties(self.store, self.presence_router, states)
+        room_ids_to_states, users_to_states = parties
+
+        self.notifier.on_new_event(
+            "presence_key",
+            stream_id,
+            rooms=room_ids_to_states.keys(),
+            users=users_to_states.keys(),
+        )
+
+    async def process_replication_rows(self, token, rows):
+        states = [
+            UserPresenceState(
+                row.user_id,
+                row.state,
+                row.last_active_ts,
+                row.last_federation_update_ts,
+                row.last_user_sync_ts,
+                row.status_msg,
+                row.currently_active,
+            )
+            for row in rows
+        ]
+
+        for state in states:
+            self.user_to_current_state[state.user_id] = state
+
+        stream_id = token
+        await self.notify_from_replication(states, stream_id)
+
+    def get_currently_syncing_users_for_replication(self) -> Iterable[str]:
+        return [
+            user_id
+            for user_id, count in self._user_to_num_current_syncs.items()
+            if count > 0
+        ]
+
+    async def set_state(self, target_user, state, ignore_status_msg=False):
+        """Set the presence state of the user."""
+        presence = state["presence"]
+
+        valid_presence = (
+            PresenceState.ONLINE,
+            PresenceState.UNAVAILABLE,
+            PresenceState.OFFLINE,
+            PresenceState.BUSY,
+        )
+
+        if presence not in valid_presence or (
+            presence == PresenceState.BUSY and not self._busy_presence_enabled
+        ):
+            raise SynapseError(400, "Invalid presence state")
+
+        user_id = target_user.to_string()
+
+        # If presence is disabled, no-op
+        if not self.hs.config.use_presence:
+            return
+
+        # Proxy request to master
+        await self._set_state_client(
+            user_id=user_id, state=state, ignore_status_msg=ignore_status_msg
+        )
+
+    async def bump_presence_active_time(self, user):
+        """We've seen the user do something that indicates they're interacting
+        with the app.
+        """
+        # If presence is disabled, no-op
+        if not self.hs.config.use_presence:
+            return
+
+        # Proxy request to master
+        user_id = user.to_string()
+        await self._bump_active_client(user_id=user_id)
+
 
 class PresenceHandler(BasePresenceHandler):
     def __init__(self, hs: "HomeServer"):

+ 224 - 7
synapse/replication/tcp/client.py

@@ -14,22 +14,36 @@
 """A replication client for use by synapse workers.
 """
 import logging
-from typing import TYPE_CHECKING, Dict, List, Tuple
+from typing import TYPE_CHECKING, Dict, List, Optional, Set, Tuple
 
 from twisted.internet.defer import Deferred
 from twisted.internet.protocol import ReconnectingClientFactory
 
 from synapse.api.constants import EventTypes
+from synapse.federation import send_queue
+from synapse.federation.sender import FederationSender
 from synapse.logging.context import PreserveLoggingContext, make_deferred_yieldable
+from synapse.metrics.background_process_metrics import run_as_background_process
 from synapse.replication.tcp.protocol import ClientReplicationStreamProtocol
-from synapse.replication.tcp.streams import TypingStream
+from synapse.replication.tcp.streams import (
+    AccountDataStream,
+    DeviceListsStream,
+    GroupServerStream,
+    PresenceStream,
+    PushersStream,
+    PushRulesStream,
+    ReceiptsStream,
+    TagAccountDataStream,
+    ToDeviceStream,
+    TypingStream,
+)
 from synapse.replication.tcp.streams.events import (
     EventsStream,
     EventsStreamEventRow,
     EventsStreamRow,
 )
-from synapse.types import PersistedEventPosition, UserID
-from synapse.util.async_helpers import timeout_deferred
+from synapse.types import PersistedEventPosition, ReadReceipt, UserID
+from synapse.util.async_helpers import Linearizer, timeout_deferred
 from synapse.util.metrics import Measure
 
 if TYPE_CHECKING:
@@ -105,6 +119,14 @@ class ReplicationDataHandler:
         self._instance_name = hs.get_instance_name()
         self._typing_handler = hs.get_typing_handler()
 
+        self._notify_pushers = hs.config.start_pushers
+        self._pusher_pool = hs.get_pusherpool()
+        self._presence_handler = hs.get_presence_handler()
+
+        self.send_handler = None  # type: Optional[FederationSenderHandler]
+        if hs.should_send_federation():
+            self.send_handler = FederationSenderHandler(hs)
+
         # Map from stream to list of deferreds waiting for the stream to
         # arrive at a particular position. The lists are sorted by stream position.
         self._streams_to_waiters = {}  # type: Dict[str, List[Tuple[int, Deferred]]]
@@ -125,13 +147,53 @@ class ReplicationDataHandler:
         """
         self.store.process_replication_rows(stream_name, instance_name, token, rows)
 
+        if self.send_handler:
+            await self.send_handler.process_replication_rows(stream_name, token, rows)
+
         if stream_name == TypingStream.NAME:
             self._typing_handler.process_replication_rows(token, rows)
             self.notifier.on_new_event(
                 "typing_key", token, rooms=[row.room_id for row in rows]
             )
-
-        if stream_name == EventsStream.NAME:
+        elif stream_name == PushRulesStream.NAME:
+            self.notifier.on_new_event(
+                "push_rules_key", token, users=[row.user_id for row in rows]
+            )
+        elif stream_name in (AccountDataStream.NAME, TagAccountDataStream.NAME):
+            self.notifier.on_new_event(
+                "account_data_key", token, users=[row.user_id for row in rows]
+            )
+        elif stream_name == ReceiptsStream.NAME:
+            self.notifier.on_new_event(
+                "receipt_key", token, rooms=[row.room_id for row in rows]
+            )
+            await self._pusher_pool.on_new_receipts(
+                token, token, {row.room_id for row in rows}
+            )
+        elif stream_name == ToDeviceStream.NAME:
+            entities = [row.entity for row in rows if row.entity.startswith("@")]
+            if entities:
+                self.notifier.on_new_event("to_device_key", token, users=entities)
+        elif stream_name == DeviceListsStream.NAME:
+            all_room_ids = set()  # type: Set[str]
+            for row in rows:
+                if row.entity.startswith("@"):
+                    room_ids = await self.store.get_rooms_for_user(row.entity)
+                    all_room_ids.update(room_ids)
+            self.notifier.on_new_event("device_list_key", token, rooms=all_room_ids)
+        elif stream_name == GroupServerStream.NAME:
+            self.notifier.on_new_event(
+                "groups_key", token, users=[row.user_id for row in rows]
+            )
+        elif stream_name == PushersStream.NAME:
+            for row in rows:
+                if row.deleted:
+                    self.stop_pusher(row.user_id, row.app_id, row.pushkey)
+                else:
+                    await self.start_pusher(row.user_id, row.app_id, row.pushkey)
+        elif stream_name == PresenceStream.NAME:
+            await self._presence_handler.process_replication_rows(token, rows)
+        elif stream_name == EventsStream.NAME:
             # We shouldn't get multiple rows per token for events stream, so
             # we don't need to optimise this for multiple rows.
             for row in rows:
@@ -190,7 +252,7 @@ class ReplicationDataHandler:
         waiting_list[:] = waiting_list[index_of_first_deferred_not_called:]
 
     async def on_position(self, stream_name: str, instance_name: str, token: int):
-        self.store.process_replication_rows(stream_name, instance_name, token, [])
+        await self.on_rdata(stream_name, instance_name, token, [])
 
         # We poke the generic "replication" notifier to wake anything up that
         # may be streaming.
@@ -199,6 +261,11 @@ class ReplicationDataHandler:
     def on_remote_server_up(self, server: str):
         """Called when get a new REMOTE_SERVER_UP command."""
 
+        # Let's wake up the transaction queue for the server in case we have
+        # pending stuff to send to it.
+        if self.send_handler:
+            self.send_handler.wake_destination(server)
+
     async def wait_for_stream_position(
         self, instance_name: str, stream_name: str, position: int
     ):
@@ -235,3 +302,153 @@ class ReplicationDataHandler:
             logger.info(
                 "Finished waiting for repl stream %r to reach %s", stream_name, position
             )
+
+    def stop_pusher(self, user_id, app_id, pushkey):
+        if not self._notify_pushers:
+            return
+
+        key = "%s:%s" % (app_id, pushkey)
+        pushers_for_user = self._pusher_pool.pushers.get(user_id, {})
+        pusher = pushers_for_user.pop(key, None)
+        if pusher is None:
+            return
+        logger.info("Stopping pusher %r / %r", user_id, key)
+        pusher.on_stop()
+
+    async def start_pusher(self, user_id, app_id, pushkey):
+        if not self._notify_pushers:
+            return
+
+        key = "%s:%s" % (app_id, pushkey)
+        logger.info("Starting pusher %r / %r", user_id, key)
+        return await self._pusher_pool.start_pusher_by_id(app_id, pushkey, user_id)
+
+
+class FederationSenderHandler:
+    """Processes the fedration replication stream
+
+    This class is only instantiate on the worker responsible for sending outbound
+    federation transactions. It receives rows from the replication stream and forwards
+    the appropriate entries to the FederationSender class.
+    """
+
+    def __init__(self, hs: "HomeServer"):
+        assert hs.should_send_federation()
+
+        self.store = hs.get_datastore()
+        self._is_mine_id = hs.is_mine_id
+        self._hs = hs
+
+        # We need to make a temporary value to ensure that mypy picks up the
+        # right type. We know we should have a federation sender instance since
+        # `should_send_federation` is True.
+        sender = hs.get_federation_sender()
+        assert isinstance(sender, FederationSender)
+        self.federation_sender = sender
+
+        # Stores the latest position in the federation stream we've gotten up
+        # to. This is always set before we use it.
+        self.federation_position = None  # type: Optional[int]
+
+        self._fed_position_linearizer = Linearizer(name="_fed_position_linearizer")
+
+    def wake_destination(self, server: str):
+        self.federation_sender.wake_destination(server)
+
+    async def process_replication_rows(self, stream_name, token, rows):
+        # The federation stream contains things that we want to send out, e.g.
+        # presence, typing, etc.
+        if stream_name == "federation":
+            send_queue.process_rows_for_federation(self.federation_sender, rows)
+            await self.update_token(token)
+
+        # ... and when new receipts happen
+        elif stream_name == ReceiptsStream.NAME:
+            await self._on_new_receipts(rows)
+
+        # ... as well as device updates and messages
+        elif stream_name == DeviceListsStream.NAME:
+            # The entities are either user IDs (starting with '@') whose devices
+            # have changed, or remote servers that we need to tell about
+            # changes.
+            hosts = {row.entity for row in rows if not row.entity.startswith("@")}
+            for host in hosts:
+                self.federation_sender.send_device_messages(host)
+
+        elif stream_name == ToDeviceStream.NAME:
+            # The to_device stream includes stuff to be pushed to both local
+            # clients and remote servers, so we ignore entities that start with
+            # '@' (since they'll be local users rather than destinations).
+            hosts = {row.entity for row in rows if not row.entity.startswith("@")}
+            for host in hosts:
+                self.federation_sender.send_device_messages(host)
+
+    async def _on_new_receipts(self, rows):
+        """
+        Args:
+            rows (Iterable[synapse.replication.tcp.streams.ReceiptsStream.ReceiptsStreamRow]):
+                new receipts to be processed
+        """
+        for receipt in rows:
+            # we only want to send on receipts for our own users
+            if not self._is_mine_id(receipt.user_id):
+                continue
+            receipt_info = ReadReceipt(
+                receipt.room_id,
+                receipt.receipt_type,
+                receipt.user_id,
+                [receipt.event_id],
+                receipt.data,
+            )
+            await self.federation_sender.send_read_receipt(receipt_info)
+
+    async def update_token(self, token):
+        """Update the record of where we have processed to in the federation stream.
+
+        Called after we have processed a an update received over replication. Sends
+        a FEDERATION_ACK back to the master, and stores the token that we have processed
+         in `federation_stream_position` so that we can restart where we left off.
+        """
+        self.federation_position = token
+
+        # We save and send the ACK to master asynchronously, so we don't block
+        # processing on persistence. We don't need to do this operation for
+        # every single RDATA we receive, we just need to do it periodically.
+
+        if self._fed_position_linearizer.is_queued(None):
+            # There is already a task queued up to save and send the token, so
+            # no need to queue up another task.
+            return
+
+        run_as_background_process("_save_and_send_ack", self._save_and_send_ack)
+
+    async def _save_and_send_ack(self):
+        """Save the current federation position in the database and send an ACK
+        to master with where we're up to.
+        """
+        # We should only be calling this once we've got a token.
+        assert self.federation_position is not None
+
+        try:
+            # We linearize here to ensure we don't have races updating the token
+            #
+            # XXX this appears to be redundant, since the ReplicationCommandHandler
+            # has a linearizer which ensures that we only process one line of
+            # replication data at a time. Should we remove it, or is it doing useful
+            # service for robustness? Or could we replace it with an assertion that
+            # we're not being re-entered?
+
+            with (await self._fed_position_linearizer.queue(None)):
+                # We persist and ack the same position, so we take a copy of it
+                # here as otherwise it can get modified from underneath us.
+                current_position = self.federation_position
+
+                await self.store.update_federation_out_pos(
+                    "federation", current_position
+                )
+
+                # We ACK this token over replication so that the master can drop
+                # its in memory queues
+                self._hs.get_tcp_replication().send_federation_ack(current_position)
+        except Exception:
+            logger.exception("Error updating federation stream position")

+ 10 - 3
synapse/server.py

@@ -85,7 +85,11 @@ from synapse.handlers.initial_sync import InitialSyncHandler
 from synapse.handlers.message import EventCreationHandler, MessageHandler
 from synapse.handlers.pagination import PaginationHandler
 from synapse.handlers.password_policy import PasswordPolicyHandler
-from synapse.handlers.presence import PresenceHandler
+from synapse.handlers.presence import (
+    BasePresenceHandler,
+    PresenceHandler,
+    WorkerPresenceHandler,
+)
 from synapse.handlers.profile import ProfileHandler
 from synapse.handlers.read_marker import ReadMarkerHandler
 from synapse.handlers.receipts import ReceiptsHandler
@@ -415,8 +419,11 @@ class HomeServer(metaclass=abc.ABCMeta):
         return StateResolutionHandler(self)
 
     @cache_in_self
-    def get_presence_handler(self) -> PresenceHandler:
-        return PresenceHandler(self)
+    def get_presence_handler(self) -> BasePresenceHandler:
+        if self.config.worker_app:
+            return WorkerPresenceHandler(self)
+        else:
+            return PresenceHandler(self)
 
     @cache_in_self
     def get_typing_writer_handler(self) -> TypingWriterHandler:

+ 3 - 5
tests/replication/_base.py

@@ -21,13 +21,11 @@ from twisted.web.http import HTTPChannel
 from twisted.web.resource import Resource
 from twisted.web.server import Request, Site
 
-from synapse.app.generic_worker import (
-    GenericWorkerReplicationHandler,
-    GenericWorkerServer,
-)
+from synapse.app.generic_worker import GenericWorkerServer
 from synapse.http.server import JsonResource
 from synapse.http.site import SynapseRequest, SynapseSite
 from synapse.replication.http import ReplicationRestResource
+from synapse.replication.tcp.client import ReplicationDataHandler
 from synapse.replication.tcp.handler import ReplicationCommandHandler
 from synapse.replication.tcp.protocol import ClientReplicationStreamProtocol
 from synapse.replication.tcp.resource import (
@@ -431,7 +429,7 @@ class BaseMultiWorkerStreamTestCase(unittest.HomeserverTestCase):
             server_protocol.makeConnection(server_to_client_transport)
 
 
-class TestReplicationDataHandler(GenericWorkerReplicationHandler):
+class TestReplicationDataHandler(ReplicationDataHandler):
     """Drop-in for ReplicationDataHandler which just collects RDATA rows"""
 
     def __init__(self, hs: HomeServer):