# Copyright 2017 Vector Creations Ltd
# Copyright 2020, 2022 The Matrix.org Foundation C.I.C.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import logging
from typing import (
    TYPE_CHECKING,
    Any,
    Awaitable,
    Dict,
    Iterable,
    Iterator,
    List,
    Optional,
    Set,
    Tuple,
    TypeVar,
    Union,
)

from prometheus_client import Counter
from typing_extensions import Deque
from twisted.internet.protocol import ReconnectingClientFactory

from synapse.metrics import LaterGauge
from synapse.metrics.background_process_metrics import run_as_background_process
from synapse.replication.tcp.commands import (
    ClearUserSyncsCommand,
    Command,
    FederationAckCommand,
    PositionCommand,
    RdataCommand,
    RemoteServerUpCommand,
    ReplicateCommand,
    UserIpCommand,
    UserSyncCommand,
)
from synapse.replication.tcp.context import ClientContextFactory
from synapse.replication.tcp.protocol import IReplicationConnection
from synapse.replication.tcp.streams import (
    STREAMS_MAP,
    AccountDataStream,
    BackfillStream,
    CachesStream,
    EventsStream,
    FederationStream,
    PresenceFederationStream,
    PresenceStream,
    ReceiptsStream,
    Stream,
    ToDeviceStream,
    TypingStream,
)

if TYPE_CHECKING:
    from synapse.server import HomeServer

logger = logging.getLogger(__name__)

# number of updates received for each RDATA stream
inbound_rdata_count = Counter(
    "synapse_replication_tcp_protocol_inbound_rdata_count", "", ["stream_name"]
)
user_sync_counter = Counter("synapse_replication_tcp_resource_user_sync", "")
federation_ack_counter = Counter("synapse_replication_tcp_resource_federation_ack", "")
remove_pusher_counter = Counter("synapse_replication_tcp_resource_remove_pusher", "")
user_ip_cache_counter = Counter("synapse_replication_tcp_resource_user_ip_cache", "")

# the type of the entries in _command_queues_by_stream
_StreamCommandQueue = Deque[
    Tuple[Union[RdataCommand, PositionCommand], IReplicationConnection]
]


class ReplicationCommandHandler:
    """Handles incoming commands from replication as well as sending commands
    back out to connections.
    """

    def __init__(self, hs: "HomeServer"):
        self._replication_data_handler = hs.get_replication_data_handler()
        self._presence_handler = hs.get_presence_handler()
        self._store = hs.get_datastores().main
        self._notifier = hs.get_notifier()
        self._clock = hs.get_clock()
        self._instance_id = hs.get_instance_id()
        self._instance_name = hs.get_instance_name()

        # Additional Redis channel suffixes to subscribe to.
        self._channels_to_subscribe_to: List[str] = []

        self._is_presence_writer = (
            hs.get_instance_name() in hs.config.worker.writers.presence
        )

        self._streams: Dict[str, Stream] = {
            stream.NAME: stream(hs) for stream in STREAMS_MAP.values()
        }

        # List of streams that this instance is the source of
        self._streams_to_replicate: List[Stream] = []

        for stream in self._streams.values():
            if hs.config.redis.redis_enabled and stream.NAME == CachesStream.NAME:
                # All workers can write to the cache invalidation stream when
                # using redis.
                self._streams_to_replicate.append(stream)
                continue

            if isinstance(stream, (EventsStream, BackfillStream)):
                # Only add EventsStream and BackfillStream as a source on the
                # instance in charge of event persistence.
                if hs.get_instance_name() in hs.config.worker.writers.events:
                    self._streams_to_replicate.append(stream)
                continue

            if isinstance(stream, ToDeviceStream):
                # Only add ToDeviceStream as a source on instances in charge of
                # sending to-device messages.
                if hs.get_instance_name() in hs.config.worker.writers.to_device:
                    self._streams_to_replicate.append(stream)
                continue

            if isinstance(stream, TypingStream):
                # Only add TypingStream as a source on the instance in charge of
                # typing.
                if hs.get_instance_name() in hs.config.worker.writers.typing:
                    self._streams_to_replicate.append(stream)
                continue

            if isinstance(stream, AccountDataStream):
                # Only add AccountDataStream as a source on the instance in
                # charge of account_data persistence.
                if hs.get_instance_name() in hs.config.worker.writers.account_data:
                    self._streams_to_replicate.append(stream)
                continue

            if isinstance(stream, ReceiptsStream):
                # Only add ReceiptsStream as a source on the instance in charge
                # of receipts.
                if hs.get_instance_name() in hs.config.worker.writers.receipts:
                    self._streams_to_replicate.append(stream)
                continue

            if isinstance(stream, (PresenceStream, PresenceFederationStream)):
                # Only add PresenceStream and PresenceFederationStream as a
                # source on the instance in charge of presence.
                if self._is_presence_writer:
                    self._streams_to_replicate.append(stream)
                continue

            # Only add any other streams if we're on master.
            if hs.config.worker.worker_app is not None:
                continue

            if (
                stream.NAME == FederationStream.NAME
                and hs.config.worker.send_federation
            ):
                # We only support the federation stream if federation sending
                # has been disabled on the master.
                continue
            self._streams_to_replicate.append(stream)

        # Map of stream name to batched updates. See RdataCommand for info on
        # how batching works.
        self._pending_batches: Dict[str, List[Any]] = {}

        # The factory used to create connections.
        self._factory: Optional[ReconnectingClientFactory] = None

        # The currently connected connections. (The list of places we need to send
        # outgoing replication commands to.)
        self._connections: List[IReplicationConnection] = []

        LaterGauge(
            "synapse_replication_tcp_resource_total_connections",
            "",
            [],
            lambda: len(self._connections),
        )

        # When POSITION or RDATA commands arrive, we stick them in a queue and process
        # them in order in a separate background process.

        # the streams which are currently being processed by _unsafe_process_queue
        self._processing_streams: Set[str] = set()

        # for each stream, a queue of commands that are awaiting processing, and the
        # connection that they arrived on.
        self._command_queues_by_stream = {
            stream_name: _StreamCommandQueue() for stream_name in self._streams
        }

        # For each connection, the incoming stream names that have received a POSITION
        # from that connection.
        self._streams_by_connection: Dict[IReplicationConnection, Set[str]] = {}

        LaterGauge(
            "synapse_replication_tcp_command_queue",
            "Number of inbound RDATA/POSITION commands queued for processing",
            ["stream_name"],
            lambda: {
                (stream_name,): len(queue)
                for stream_name, queue in self._command_queues_by_stream.items()
            },
        )

        self._is_master = hs.config.worker.worker_app is None

        self._federation_sender = None
        if self._is_master and not hs.config.worker.send_federation:
            self._federation_sender = hs.get_federation_sender()

        self._server_notices_sender = None
        if self._is_master:
            self._server_notices_sender = hs.get_server_notices_sender()

        if hs.config.redis.redis_enabled:
            # If we're using Redis, it's the background worker that should
            # receive USER_IP commands and store the relevant client IPs.
            self._should_insert_client_ips = hs.config.worker.run_background_tasks
        else:
            # If we're NOT using Redis, this must be handled by the master
            self._should_insert_client_ips = hs.get_instance_name() == "master"

        if self._is_master or self._should_insert_client_ips:
            self.subscribe_to_channel("USER_IP")

    def subscribe_to_channel(self, channel_name: str) -> None:
        """
        Indicates that we wish to subscribe to a Redis channel by name.

        (The name will later be prefixed with the server name; i.e. subscribing
        to the 'ABC' channel actually subscribes to 'example.com/ABC' Redis-side.)

        Raises:
          - If replication has already started, then it's too late to subscribe
            to new channels.
        """
        if self._factory is not None:
            # We don't allow subscribing after the fact to avoid the chance
            # of missing an important message because we didn't subscribe in time.
            raise RuntimeError(
                "Cannot subscribe to more channels after replication started."
            )

        if channel_name not in self._channels_to_subscribe_to:
            self._channels_to_subscribe_to.append(channel_name)
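
    # A usage sketch for the method above (illustrative; assumes
    # `hs.get_replication_command_handler()` is the accessor for this handler):
    # extra channels must be requested before `start_replication` runs, since
    # `subscribe_to_channel` raises once `self._factory` has been created.
    #
    #     handler = hs.get_replication_command_handler()
    #     handler.subscribe_to_channel("MY_CHANNEL")  # hypothetical channel name
    #     handler.start_replication(hs)
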
    def _add_command_to_stream_queue(
        self, conn: IReplicationConnection, cmd: Union[RdataCommand, PositionCommand]
    ) -> None:
        """Queue the given received command for processing

        Adds the given command to the per-stream queue, and processes the queue if
        necessary
        """
        stream_name = cmd.stream_name
        queue = self._command_queues_by_stream.get(stream_name)
        if queue is None:
            logger.error("Got %s for unknown stream: %s", cmd.NAME, stream_name)
            return

        queue.append((cmd, conn))

        # if we're already processing this stream, there's nothing more to do:
        # the new entry on the queue will get picked up in due course
        if stream_name in self._processing_streams:
            return

        # fire off a background process to start processing the queue.
        run_as_background_process(
            "process-replication-data", self._unsafe_process_queue, stream_name
        )

    async def _unsafe_process_queue(self, stream_name: str) -> None:
        """Processes the command queue for the given stream, until it is empty

        Does not check if there is already a thread processing the queue, hence "unsafe"
        """
        assert stream_name not in self._processing_streams

        self._processing_streams.add(stream_name)
        try:
            queue = self._command_queues_by_stream.get(stream_name)
            while queue:
                cmd, conn = queue.popleft()
                try:
                    await self._process_command(cmd, conn, stream_name)
                except Exception:
                    logger.exception("Failed to handle command %s", cmd)
        finally:
            self._processing_streams.discard(stream_name)

    async def _process_command(
        self,
        cmd: Union[PositionCommand, RdataCommand],
        conn: IReplicationConnection,
        stream_name: str,
    ) -> None:
        if isinstance(cmd, PositionCommand):
            await self._process_position(stream_name, conn, cmd)
        elif isinstance(cmd, RdataCommand):
            await self._process_rdata(stream_name, conn, cmd)
        else:
            # This shouldn't be possible
            raise Exception("Unrecognised command %s in stream queue" % (cmd.NAME,))
    def start_replication(self, hs: "HomeServer") -> None:
        """Helper method to start replication."""
        from synapse.replication.tcp.redis import (
            RedisDirectTcpReplicationClientFactory,
        )

        # First let's ensure that we have a ReplicationStreamer started.
        hs.get_replication_streamer()

        # We need two connections to redis, one for the subscription stream and
        # one to send commands to (as you can't send further redis commands to a
        # connection after SUBSCRIBE is called).

        # First create the connection for sending commands.
        outbound_redis_connection = hs.get_outbound_redis_connection()

        # Now create the factory/connection for the subscription stream.
        self._factory = RedisDirectTcpReplicationClientFactory(
            hs,
            outbound_redis_connection,
            channel_names=self._channels_to_subscribe_to,
        )

        reactor = hs.get_reactor()
        redis_config = hs.config.redis
        if hs.config.redis.redis_use_tls:
            ssl_context_factory = ClientContextFactory(hs.config.redis)
            reactor.connectSSL(
                redis_config.redis_host,
                redis_config.redis_port,
                self._factory,
                ssl_context_factory,
                timeout=30,
                bindAddress=None,
            )
        else:
            reactor.connectTCP(
                redis_config.redis_host,
                redis_config.redis_port,
                self._factory,
                timeout=30,
                bindAddress=None,
            )

    def get_streams(self) -> Dict[str, Stream]:
        """Get a map from stream name to all streams."""
        return self._streams

    def get_streams_to_replicate(self) -> List[Stream]:
  310. """Get a list of streams that this instances replicates."""
        return self._streams_to_replicate

    def on_REPLICATE(self, conn: IReplicationConnection, cmd: ReplicateCommand) -> None:
        self.send_positions_to_connection(conn)

    def send_positions_to_connection(self, conn: IReplicationConnection) -> None:
        """Send current position of all streams this process is source of to
        the connection.
        """
        # We respond with current position of all streams this instance
        # replicates.
        for stream in self.get_streams_to_replicate():
            # Note that we use the current token as the prev token here (rather
            # than stream.last_token), as we can't be sure that there have been
            # no rows written between last token and the current token (since we
            # might be racing with the replication sending bg process).
            current_token = stream.current_token(self._instance_name)
            self.send_command(
                PositionCommand(
                    stream.NAME,
                    self._instance_name,
                    current_token,
                    current_token,
                )
            )
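
    # For reference, a sketch of the resulting wire format (inferred from the
    # PositionCommand arguments above; the authoritative serialisation lives in
    # synapse.replication.tcp.commands):
    #
    #     POSITION <stream_name> <instance_name> <prev_token> <new_token>
    #
    # Sending (current_token, current_token) advertises "I am at this token"
    # without claiming any rows between a previous token and the current one.
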
    def on_USER_SYNC(
        self, conn: IReplicationConnection, cmd: UserSyncCommand
    ) -> Optional[Awaitable[None]]:
        user_sync_counter.inc()

        if self._is_presence_writer:
            return self._presence_handler.update_external_syncs_row(
                cmd.instance_id, cmd.user_id, cmd.is_syncing, cmd.last_sync_ms
            )
        else:
            return None

    def on_CLEAR_USER_SYNC(
        self, conn: IReplicationConnection, cmd: ClearUserSyncsCommand
    ) -> Optional[Awaitable[None]]:
        if self._is_presence_writer:
            return self._presence_handler.update_external_syncs_clear(cmd.instance_id)
        else:
            return None

    def on_FEDERATION_ACK(
        self, conn: IReplicationConnection, cmd: FederationAckCommand
    ) -> None:
        federation_ack_counter.inc()

        if self._federation_sender:
            self._federation_sender.federation_ack(cmd.instance_name, cmd.token)

    def on_USER_IP(
        self, conn: IReplicationConnection, cmd: UserIpCommand
    ) -> Optional[Awaitable[None]]:
        user_ip_cache_counter.inc()

        if self._is_master or self._should_insert_client_ips:
            # We make a point of only returning an awaitable if there's actually
            # something to do; on_USER_IP is not an async function, but
            # _handle_user_ip is.
            # If on_USER_IP returns an awaitable, it gets scheduled as a
            # background process (see `BaseReplicationStreamProtocol.handle_command`).
            return self._handle_user_ip(cmd)
        else:
            # Returning None when this process definitely has nothing to do
            # reduces the overhead of handling the USER_IP command, which is
            # currently broadcast to all workers regardless of utility.
            return None

    async def _handle_user_ip(self, cmd: UserIpCommand) -> None:
        """
        Handles a User IP, branching depending on whether we are the main process
        and/or the background worker.
        """
        if self._is_master:
            assert self._server_notices_sender is not None
            await self._server_notices_sender.on_user_ip(cmd.user_id)

        if self._should_insert_client_ips:
            await self._store.insert_client_ip(
                cmd.user_id,
                cmd.access_token,
                cmd.ip,
                cmd.user_agent,
                cmd.device_id,
                cmd.last_seen,
            )

    def on_RDATA(self, conn: IReplicationConnection, cmd: RdataCommand) -> None:
        if cmd.instance_name == self._instance_name:
            # Ignore RDATA that are just our own echoes
            return

        stream_name = cmd.stream_name
        inbound_rdata_count.labels(stream_name).inc()

        # We put the received command into a queue here for two reasons:
        #   1. so we don't try and concurrently handle multiple rows for the
        #      same stream, and
        #   2. so we don't race with getting a POSITION command and fetching
        #      missing RDATA.
        self._add_command_to_stream_queue(conn, cmd)
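
    # A sketch of the inbound line being handled here (the authoritative
    # parsing lives in synapse.replication.tcp.commands):
    #
    #     RDATA <stream_name> <instance_name> <token> <row_json>
    #
    # where <token> may be a literal "batch" marker, which parses to a None
    # token and flags all-but-the-last rows of a multi-row batch (see the
    # `cmd.token is None` branch in _process_rdata below).
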
    async def _process_rdata(
        self, stream_name: str, conn: IReplicationConnection, cmd: RdataCommand
    ) -> None:
        """Process an RDATA command

        Called after the command has been popped off the queue of inbound commands
        """
        try:
            row = STREAMS_MAP[stream_name].parse_row(cmd.row)
        except Exception as e:
            raise Exception(
                "Failed to parse RDATA: %r %r" % (stream_name, cmd.row)
            ) from e

        # make sure that we've processed a POSITION for this stream *on this
        # connection*. (A POSITION on another connection is no good, as there
        # is no guarantee that we have seen all the intermediate updates.)
        sbc = self._streams_by_connection.get(conn)
        if not sbc or stream_name not in sbc:
            # Let's drop the row for now, on the assumption we'll receive a
            # `POSITION` soon and we'll catch up correctly then.
            logger.debug(
                "Discarding RDATA for unconnected stream %s -> %s",
                stream_name,
                cmd.token,
            )
            return
        if cmd.token is None:
            # I.e. this is part of a batch of updates for this stream (in
            # which case batch until we get an update for the stream with a
            # non-None token).
            self._pending_batches.setdefault(stream_name, []).append(row)
            return

        # Check if this is the last of a batch of updates
        rows = self._pending_batches.pop(stream_name, [])
        rows.append(row)

        stream = self._streams[stream_name]

        # Find where we previously streamed up to.
        current_token = stream.current_token(cmd.instance_name)

        # Discard this data if this token is earlier than the current
        # position. Note that streams can be reset (in which case you
        # expect an earlier token), but that must be preceded by a
        # POSITION command.
        if cmd.token <= current_token:
            logger.debug(
                "Discarding RDATA from stream %s at position %s before previous position %s",
                stream_name,
                cmd.token,
                current_token,
            )
        else:
            await self.on_rdata(stream_name, cmd.instance_name, cmd.token, rows)

    async def on_rdata(
        self, stream_name: str, instance_name: str, token: int, rows: list
    ) -> None:
        """Called to handle a batch of replication data with a given stream token.

        Args:
            stream_name: name of the replication stream for this batch of rows
            instance_name: the instance that wrote the rows.
            token: stream token for this batch of rows
            rows: a list of Stream.ROW_TYPE objects as returned by
                Stream.parse_row.
        """
        logger.debug("Received rdata %s (%s) -> %s", stream_name, instance_name, token)
        await self._replication_data_handler.on_rdata(
            stream_name, instance_name, token, rows
        )

    def on_POSITION(self, conn: IReplicationConnection, cmd: PositionCommand) -> None:
        if cmd.instance_name == self._instance_name:
            # Ignore POSITION that are just our own echoes
            return

        logger.debug("Handling '%s %s'", cmd.NAME, cmd.to_line())

        self._add_command_to_stream_queue(conn, cmd)

    async def _process_position(
        self, stream_name: str, conn: IReplicationConnection, cmd: PositionCommand
    ) -> None:
        """Process a POSITION command

        Called after the command has been popped off the queue of inbound commands
        """
        stream = self._streams[stream_name]

        # We're about to go and catch up with the stream, so remove from set
        # of connected streams.
        for streams in self._streams_by_connection.values():
            streams.discard(stream_name)

        # We clear the pending batches for the stream as the fetching of the
        # missing updates below will fetch all rows in the batch.
        self._pending_batches.pop(stream_name, [])

        # Find where we previously streamed up to.
        current_token = stream.current_token(cmd.instance_name)

        # If the position token matches our current token then we're up to
        # date and there's nothing to do. Otherwise, fetch all updates
        # between then and now.
        missing_updates = cmd.prev_token != current_token
        while missing_updates:
            # Note: There may very well not be any new updates, but we check to
            # make sure. This can particularly happen for the event stream where
            # event persisters continuously send `POSITION`. See `resource.py`
            # for why this can happen.

            logger.info(
                "Fetching replication rows for '%s' between %i and %i",
                stream_name,
                current_token,
                cmd.new_token,
            )
            (updates, current_token, missing_updates) = await stream.get_updates_since(
                cmd.instance_name, current_token, cmd.new_token
            )

            # TODO: add some tests for this

            # Some streams return multiple rows with the same stream IDs,
            # which need to be processed in batches.
            for token, rows in _batch_updates(updates):
                await self.on_rdata(
                    stream_name,
                    cmd.instance_name,
                    token,
                    [stream.parse_row(row) for row in rows],
                )

        logger.info("Caught up with stream '%s' to %i", stream_name, cmd.new_token)

        # We've now caught up to the position sent to us, notify handler.
        await self._replication_data_handler.on_position(
            cmd.stream_name, cmd.instance_name, cmd.new_token
        )

        self._streams_by_connection.setdefault(conn, set()).add(stream_name)

    def on_REMOTE_SERVER_UP(
        self, conn: IReplicationConnection, cmd: RemoteServerUpCommand
    ) -> None:
  526. """Called when get a new REMOTE_SERVER_UP command."""
        self._replication_data_handler.on_remote_server_up(cmd.data)

        self._notifier.notify_remote_server_up(cmd.data)

    def new_connection(self, connection: IReplicationConnection) -> None:
        """Called when we have a new connection."""
        self._connections.append(connection)

        # If we are connected to replication as a client (rather than a server)
        # we need to reset the reconnection delay on the client factory (which
        # is used to do exponential back off when the connection drops).
        #
        # Ideally we would reset the delay when we've "fully established" the
        # connection (for some definition thereof) to stop us from tightlooping
        # on reconnection if something fails after this point and we drop the
        # connection. Unfortunately, we don't really have a better definition of
        # "fully established" than the connection being established.
        if self._factory:
            self._factory.resetDelay()

        # Tell the other end if we have any users currently syncing.
        currently_syncing = (
            self._presence_handler.get_currently_syncing_users_for_replication()
        )

        now = self._clock.time_msec()
        for user_id in currently_syncing:
            connection.send_command(
                UserSyncCommand(self._instance_id, user_id, True, now)
            )

    def lost_connection(self, connection: IReplicationConnection) -> None:
        """Called when a connection is closed/lost."""
        # we no longer need _streams_by_connection for this connection.
        streams = self._streams_by_connection.pop(connection, None)
        if streams:
            logger.info(
                "Lost replication connection; streams now disconnected: %s", streams
            )

        try:
            self._connections.remove(connection)
        except ValueError:
            pass

    def connected(self) -> bool:
        """Do we have any replication connections open?

        Is used by e.g. `ReplicationStreamer` to no-op if nothing is connected.
        """
        return bool(self._connections)

    def send_command(self, cmd: Command) -> None:
        """Send a command to all connected connections.

        Args:
            cmd
        """
        if self._connections:
            for connection in self._connections:
                try:
                    connection.send_command(cmd)
                except Exception:
                    # We probably want to catch some types of exceptions here
                    # and log them as warnings (e.g. connection gone), but I
                    # can't find what those exception types would be.
                    logger.exception(
                        "Failed to write command %s to connection %s",
                        cmd.NAME,
                        connection,
                    )
        else:
            logger.warning("Dropping command as not connected: %r", cmd.NAME)

    def send_federation_ack(self, token: int) -> None:
        """Ack data for the federation stream. This allows the master to drop
        data stored purely in memory.
        """
        self.send_command(FederationAckCommand(self._instance_name, token))

    def send_user_sync(
        self, instance_id: str, user_id: str, is_syncing: bool, last_sync_ms: int
    ) -> None:
        """Poke the master that a user has started/stopped syncing."""
        self.send_command(
            UserSyncCommand(instance_id, user_id, is_syncing, last_sync_ms)
        )

    def send_user_ip(
        self,
        user_id: str,
        access_token: str,
        ip: str,
        user_agent: str,
        device_id: Optional[str],
        last_seen: int,
    ) -> None:
        """Tell the master that the user made a request."""
        cmd = UserIpCommand(user_id, access_token, ip, user_agent, device_id, last_seen)
        self.send_command(cmd)

    def send_remote_server_up(self, server: str) -> None:
        self.send_command(RemoteServerUpCommand(server))

    def stream_update(self, stream_name: str, token: Optional[int], data: Any) -> None:
        """Called when a new update is available to stream to Redis subscribers.

        We need to check if the client is interested in the stream or not
        """
        self.send_command(RdataCommand(stream_name, self._instance_name, token, data))


UpdateToken = TypeVar("UpdateToken")
UpdateRow = TypeVar("UpdateRow")


def _batch_updates(
    updates: Iterable[Tuple[UpdateToken, UpdateRow]]
) -> Iterator[Tuple[UpdateToken, List[UpdateRow]]]:
    """Collect stream updates with the same token together

    Given a series of updates returned by Stream.get_updates_since(), collects
    the updates which share the same stream_id together.

    For example:

        [(1, a), (1, b), (2, c), (3, d), (3, e)]

    becomes:

        [
            (1, [a, b]),
            (2, [c]),
            (3, [d, e]),
        ]
    """
    update_iter = iter(updates)

    first_update = next(update_iter, None)
    if first_update is None:
        # empty input
        return

    current_batch_token = first_update[0]
    current_batch = [first_update[1]]

    for token, row in update_iter:
        if token != current_batch_token:
            # different token to the previous row: flush the previous
            # batch and start anew
            yield current_batch_token, current_batch
            current_batch_token = token
            current_batch = []

        current_batch.append(row)

    # flush the final batch
    yield current_batch_token, current_batch
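
# A quick illustration of the batching behaviour, mirroring the docstring
# example above:
#
#     >>> list(_batch_updates([(1, "a"), (1, "b"), (2, "c"), (3, "d"), (3, "e")]))
#     [(1, ['a', 'b']), (2, ['c']), (3, ['d', 'e'])]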