test_federation_catch_up.py 18 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480
  1. from typing import List, Tuple
  2. from unittest.mock import Mock
  3. from synapse.api.constants import EventTypes
  4. from synapse.events import EventBase
  5. from synapse.federation.sender import PerDestinationQueue, TransactionManager
  6. from synapse.federation.units import Edu
  7. from synapse.rest import admin
  8. from synapse.rest.client import login, room
  9. from synapse.types import JsonDict
  10. from synapse.util.retryutils import NotRetryingDestination
  11. from tests.test_utils import event_injection, make_awaitable
  12. from tests.unittest import FederatingHomeserverTestCase
  13. class FederationCatchUpTestCases(FederatingHomeserverTestCase):
  14. """
  15. Tests cases of catching up over federation.
  16. By default for test cases federation sending is disabled. This Test class has it
  17. re-enabled for the main process.
  18. """
  19. servlets = [
  20. admin.register_servlets,
  21. room.register_servlets,
  22. login.register_servlets,
  23. ]
  24. def make_homeserver(self, reactor, clock):
  25. return self.setup_test_homeserver(
  26. federation_transport_client=Mock(spec=["send_transaction"]),
  27. )
  28. def prepare(self, reactor, clock, hs):
  29. # stub out get_current_hosts_in_room
  30. state_handler = hs.get_state_handler()
  31. # This mock is crucial for destination_rooms to be populated.
  32. state_handler.get_current_hosts_in_room = Mock(
  33. return_value=make_awaitable(["test", "host2"])
  34. )
  35. # whenever send_transaction is called, record the pdu data
  36. self.pdus = []
  37. self.failed_pdus = []
  38. self.is_online = True
  39. self.hs.get_federation_transport_client().send_transaction.side_effect = (
  40. self.record_transaction
  41. )
  42. def default_config(self) -> JsonDict:
  43. config = super().default_config()
  44. config["federation_sender_instances"] = None
  45. return config
  46. async def record_transaction(self, txn, json_cb):
  47. if self.is_online:
  48. data = json_cb()
  49. self.pdus.extend(data["pdus"])
  50. return {}
  51. else:
  52. data = json_cb()
  53. self.failed_pdus.extend(data["pdus"])
  54. raise NotRetryingDestination(0, 24 * 60 * 60 * 1000, txn.destination)
  55. def get_destination_room(self, room: str, destination: str = "host2") -> dict:
  56. """
  57. Gets the destination_rooms entry for a (destination, room_id) pair.
  58. Args:
  59. room: room ID
  60. destination: what destination, default is "host2"
  61. Returns:
  62. Dictionary of { event_id: str, stream_ordering: int }
  63. """
  64. event_id, stream_ordering = self.get_success(
  65. self.hs.get_datastores().main.db_pool.execute(
  66. "test:get_destination_rooms",
  67. None,
  68. """
  69. SELECT event_id, stream_ordering
  70. FROM destination_rooms dr
  71. JOIN events USING (stream_ordering)
  72. WHERE dr.destination = ? AND dr.room_id = ?
  73. """,
  74. destination,
  75. room,
  76. )
  77. )[0]
  78. return {"event_id": event_id, "stream_ordering": stream_ordering}
  79. def test_catch_up_destination_rooms_tracking(self):
  80. """
  81. Tests that we populate the `destination_rooms` table as needed.
  82. """
  83. self.register_user("u1", "you the one")
  84. u1_token = self.login("u1", "you the one")
  85. room = self.helper.create_room_as("u1", tok=u1_token)
  86. self.get_success(
  87. event_injection.inject_member_event(self.hs, room, "@user:host2", "join")
  88. )
  89. event_id_1 = self.helper.send(room, "wombats!", tok=u1_token)["event_id"]
  90. row_1 = self.get_destination_room(room)
  91. event_id_2 = self.helper.send(room, "rabbits!", tok=u1_token)["event_id"]
  92. row_2 = self.get_destination_room(room)
  93. # check: events correctly registered in order
  94. self.assertEqual(row_1["event_id"], event_id_1)
  95. self.assertEqual(row_2["event_id"], event_id_2)
  96. self.assertEqual(row_1["stream_ordering"], row_2["stream_ordering"] - 1)
  97. def test_catch_up_last_successful_stream_ordering_tracking(self):
  98. """
  99. Tests that we populate the `destination_rooms` table as needed.
  100. """
  101. self.register_user("u1", "you the one")
  102. u1_token = self.login("u1", "you the one")
  103. room = self.helper.create_room_as("u1", tok=u1_token)
  104. # take the remote offline
  105. self.is_online = False
  106. self.get_success(
  107. event_injection.inject_member_event(self.hs, room, "@user:host2", "join")
  108. )
  109. self.helper.send(room, "wombats!", tok=u1_token)
  110. self.pump()
  111. lsso_1 = self.get_success(
  112. self.hs.get_datastores().main.get_destination_last_successful_stream_ordering(
  113. "host2"
  114. )
  115. )
  116. self.assertIsNone(
  117. lsso_1,
  118. "There should be no last successful stream ordering for an always-offline destination",
  119. )
  120. # bring the remote online
  121. self.is_online = True
  122. event_id_2 = self.helper.send(room, "rabbits!", tok=u1_token)["event_id"]
  123. lsso_2 = self.get_success(
  124. self.hs.get_datastores().main.get_destination_last_successful_stream_ordering(
  125. "host2"
  126. )
  127. )
  128. row_2 = self.get_destination_room(room)
  129. self.assertEqual(
  130. self.pdus[0]["content"]["body"],
  131. "rabbits!",
  132. "Test fault: didn't receive the right PDU",
  133. )
  134. self.assertEqual(
  135. row_2["event_id"],
  136. event_id_2,
  137. "Test fault: destination_rooms not updated correctly",
  138. )
  139. self.assertEqual(
  140. lsso_2,
  141. row_2["stream_ordering"],
  142. "Send succeeded but not marked as last_successful_stream_ordering",
  143. )
  144. def test_catch_up_from_blank_state(self):
  145. """
  146. Runs an overall test of federation catch-up from scratch.
  147. Further tests will focus on more narrow aspects and edge-cases, but I
  148. hope to provide an overall view with this test.
  149. """
  150. # bring the other server online
  151. self.is_online = True
  152. # let's make some events for the other server to receive
  153. self.register_user("u1", "you the one")
  154. u1_token = self.login("u1", "you the one")
  155. room_1 = self.helper.create_room_as("u1", tok=u1_token)
  156. room_2 = self.helper.create_room_as("u1", tok=u1_token)
  157. # also critical to federate
  158. self.get_success(
  159. event_injection.inject_member_event(self.hs, room_1, "@user:host2", "join")
  160. )
  161. self.get_success(
  162. event_injection.inject_member_event(self.hs, room_2, "@user:host2", "join")
  163. )
  164. self.helper.send_state(
  165. room_1, event_type="m.room.topic", body={"topic": "wombat"}, tok=u1_token
  166. )
  167. # check: PDU received for topic event
  168. self.assertEqual(len(self.pdus), 1)
  169. self.assertEqual(self.pdus[0]["type"], "m.room.topic")
  170. # take the remote offline
  171. self.is_online = False
  172. # send another event
  173. self.helper.send(room_1, "hi user!", tok=u1_token)
  174. # check: things didn't go well since the remote is down
  175. self.assertEqual(len(self.failed_pdus), 1)
  176. self.assertEqual(self.failed_pdus[0]["content"]["body"], "hi user!")
  177. # let's delete the federation transmission queue
  178. # (this pretends we are starting up fresh.)
  179. self.assertFalse(
  180. self.hs.get_federation_sender()
  181. ._per_destination_queues["host2"]
  182. .transmission_loop_running
  183. )
  184. del self.hs.get_federation_sender()._per_destination_queues["host2"]
  185. # let's also clear any backoffs
  186. self.get_success(
  187. self.hs.get_datastores().main.set_destination_retry_timings(
  188. "host2", None, 0, 0
  189. )
  190. )
  191. # bring the remote online and clear the received pdu list
  192. self.is_online = True
  193. self.pdus = []
  194. # now we need to initiate a federation transaction somehow…
  195. # to do that, let's send another event (because it's simple to do)
  196. # (do it to another room otherwise the catch-up logic decides it doesn't
  197. # need to catch up room_1 — something I overlooked when first writing
  198. # this test)
  199. self.helper.send(room_2, "wombats!", tok=u1_token)
  200. # we should now have received both PDUs
  201. self.assertEqual(len(self.pdus), 2)
  202. self.assertEqual(self.pdus[0]["content"]["body"], "hi user!")
  203. self.assertEqual(self.pdus[1]["content"]["body"], "wombats!")
  204. def make_fake_destination_queue(
  205. self, destination: str = "host2"
  206. ) -> Tuple[PerDestinationQueue, List[EventBase]]:
  207. """
  208. Makes a fake per-destination queue.
  209. """
  210. transaction_manager = TransactionManager(self.hs)
  211. per_dest_queue = PerDestinationQueue(self.hs, transaction_manager, destination)
  212. results_list = []
  213. async def fake_send(
  214. destination_tm: str,
  215. pending_pdus: List[EventBase],
  216. _pending_edus: List[Edu],
  217. ) -> bool:
  218. assert destination == destination_tm
  219. results_list.extend(pending_pdus)
  220. return True # success!
  221. transaction_manager.send_new_transaction = fake_send
  222. return per_dest_queue, results_list
  223. def test_catch_up_loop(self):
  224. """
  225. Tests the behaviour of _catch_up_transmission_loop.
  226. """
  227. # ARRANGE:
  228. # - a local user (u1)
  229. # - 3 rooms which u1 is joined to (and remote user @user:host2 is
  230. # joined to)
  231. # - some events (1 to 5) in those rooms
  232. # we have 'already sent' events 1 and 2 to host2
  233. per_dest_queue, sent_pdus = self.make_fake_destination_queue()
  234. self.register_user("u1", "you the one")
  235. u1_token = self.login("u1", "you the one")
  236. room_1 = self.helper.create_room_as("u1", tok=u1_token)
  237. room_2 = self.helper.create_room_as("u1", tok=u1_token)
  238. room_3 = self.helper.create_room_as("u1", tok=u1_token)
  239. self.get_success(
  240. event_injection.inject_member_event(self.hs, room_1, "@user:host2", "join")
  241. )
  242. self.get_success(
  243. event_injection.inject_member_event(self.hs, room_2, "@user:host2", "join")
  244. )
  245. self.get_success(
  246. event_injection.inject_member_event(self.hs, room_3, "@user:host2", "join")
  247. )
  248. # create some events
  249. self.helper.send(room_1, "you hear me!!", tok=u1_token)
  250. event_id_2 = self.helper.send(room_2, "wombats!", tok=u1_token)["event_id"]
  251. self.helper.send(room_3, "Matrix!", tok=u1_token)
  252. event_id_4 = self.helper.send(room_2, "rabbits!", tok=u1_token)["event_id"]
  253. event_id_5 = self.helper.send(room_3, "Synapse!", tok=u1_token)["event_id"]
  254. # destination_rooms should already be populated, but let us pretend that we already
  255. # sent (successfully) up to and including event id 2
  256. event_2 = self.get_success(self.hs.get_datastores().main.get_event(event_id_2))
  257. # also fetch event 5 so we know its last_successful_stream_ordering later
  258. event_5 = self.get_success(self.hs.get_datastores().main.get_event(event_id_5))
  259. self.get_success(
  260. self.hs.get_datastores().main.set_destination_last_successful_stream_ordering(
  261. "host2", event_2.internal_metadata.stream_ordering
  262. )
  263. )
  264. # ACT
  265. self.get_success(per_dest_queue._catch_up_transmission_loop())
  266. # ASSERT, noticing in particular:
  267. # - event 3 not sent out, because event 5 replaces it
  268. # - order is least recent first, so event 5 comes after event 4
  269. # - catch-up is completed
  270. self.assertEqual(len(sent_pdus), 2)
  271. self.assertEqual(sent_pdus[0].event_id, event_id_4)
  272. self.assertEqual(sent_pdus[1].event_id, event_id_5)
  273. self.assertFalse(per_dest_queue._catching_up)
  274. self.assertEqual(
  275. per_dest_queue._last_successful_stream_ordering,
  276. event_5.internal_metadata.stream_ordering,
  277. )
  278. def test_catch_up_on_synapse_startup(self):
  279. """
  280. Tests the behaviour of get_catch_up_outstanding_destinations and
  281. _wake_destinations_needing_catchup.
  282. """
  283. # list of sorted server names (note that there are more servers than the batch
  284. # size used in get_catch_up_outstanding_destinations).
  285. server_names = ["server%02d" % number for number in range(42)] + ["zzzerver"]
  286. # ARRANGE:
  287. # - a local user (u1)
  288. # - a room which u1 is joined to (and remote users @user:serverXX are
  289. # joined to)
  290. # mark the remotes as online
  291. self.is_online = True
  292. self.register_user("u1", "you the one")
  293. u1_token = self.login("u1", "you the one")
  294. room_id = self.helper.create_room_as("u1", tok=u1_token)
  295. for server_name in server_names:
  296. self.get_success(
  297. event_injection.inject_member_event(
  298. self.hs, room_id, "@user:%s" % server_name, "join"
  299. )
  300. )
  301. # create an event
  302. self.helper.send(room_id, "deary me!", tok=u1_token)
  303. # ASSERT:
  304. # - All servers are up to date so none should have outstanding catch-up
  305. outstanding_when_successful = self.get_success(
  306. self.hs.get_datastores().main.get_catch_up_outstanding_destinations(None)
  307. )
  308. self.assertEqual(outstanding_when_successful, [])
  309. # ACT:
  310. # - Make the remote servers unreachable
  311. self.is_online = False
  312. # - Mark zzzerver as being backed-off from
  313. now = self.clock.time_msec()
  314. self.get_success(
  315. self.hs.get_datastores().main.set_destination_retry_timings(
  316. "zzzerver", now, now, 24 * 60 * 60 * 1000 # retry in 1 day
  317. )
  318. )
  319. # - Send an event
  320. self.helper.send(room_id, "can anyone hear me?", tok=u1_token)
  321. # ASSERT (get_catch_up_outstanding_destinations):
  322. # - all remotes are outstanding
  323. # - they are returned in batches of 25, in order
  324. outstanding_1 = self.get_success(
  325. self.hs.get_datastores().main.get_catch_up_outstanding_destinations(None)
  326. )
  327. self.assertEqual(len(outstanding_1), 25)
  328. self.assertEqual(outstanding_1, server_names[0:25])
  329. outstanding_2 = self.get_success(
  330. self.hs.get_datastores().main.get_catch_up_outstanding_destinations(
  331. outstanding_1[-1]
  332. )
  333. )
  334. self.assertNotIn("zzzerver", outstanding_2)
  335. self.assertEqual(len(outstanding_2), 17)
  336. self.assertEqual(outstanding_2, server_names[25:-1])
  337. # ACT: call _wake_destinations_needing_catchup
  338. # patch wake_destination to just count the destinations instead
  339. woken = []
  340. def wake_destination_track(destination):
  341. woken.append(destination)
  342. self.hs.get_federation_sender().wake_destination = wake_destination_track
  343. # cancel the pre-existing timer for _wake_destinations_needing_catchup
  344. # this is because we are calling it manually rather than waiting for it
  345. # to be called automatically
  346. self.hs.get_federation_sender()._catchup_after_startup_timer.cancel()
  347. self.get_success(
  348. self.hs.get_federation_sender()._wake_destinations_needing_catchup(), by=5.0
  349. )
  350. # ASSERT (_wake_destinations_needing_catchup):
  351. # - all remotes are woken up, save for zzzerver
  352. self.assertNotIn("zzzerver", woken)
  353. # - all destinations are woken exactly once; they appear once in woken.
  354. self.assertCountEqual(woken, server_names[:-1])
  355. def test_not_latest_event(self):
  356. """Test that we send the latest event in the room even if its not ours."""
  357. per_dest_queue, sent_pdus = self.make_fake_destination_queue()
  358. # Make a room with a local user, and two servers. One will go offline
  359. # and one will send some events.
  360. self.register_user("u1", "you the one")
  361. u1_token = self.login("u1", "you the one")
  362. room_1 = self.helper.create_room_as("u1", tok=u1_token)
  363. self.get_success(
  364. event_injection.inject_member_event(self.hs, room_1, "@user:host2", "join")
  365. )
  366. event_1 = self.get_success(
  367. event_injection.inject_member_event(self.hs, room_1, "@user:host3", "join")
  368. )
  369. # First we send something from the local server, so that we notice the
  370. # remote is down and go into catchup mode.
  371. self.helper.send(room_1, "you hear me!!", tok=u1_token)
  372. # Now simulate us receiving an event from the still online remote.
  373. event_2 = self.get_success(
  374. event_injection.inject_event(
  375. self.hs,
  376. type=EventTypes.Message,
  377. sender="@user:host3",
  378. room_id=room_1,
  379. content={"msgtype": "m.text", "body": "Hello"},
  380. )
  381. )
  382. self.get_success(
  383. self.hs.get_datastores().main.set_destination_last_successful_stream_ordering(
  384. "host2", event_1.internal_metadata.stream_ordering
  385. )
  386. )
  387. self.get_success(per_dest_queue._catch_up_transmission_loop())
  388. # We expect only the last message from the remote, event_2, to have been
  389. # sent, rather than the last *local* event that was sent.
  390. self.assertEqual(len(sent_pdus), 1)
  391. self.assertEqual(sent_pdus[0].event_id, event_2.event_id)
  392. self.assertFalse(per_dest_queue._catching_up)