_base.py 20 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607
  1. # Copyright 2017 Vector Creations Ltd
  2. # Copyright 2019 New Vector Ltd
  3. #
  4. # Licensed under the Apache License, Version 2.0 (the "License");
  5. # you may not use this file except in compliance with the License.
  6. # You may obtain a copy of the License at
  7. #
  8. # http://www.apache.org/licenses/LICENSE-2.0
  9. #
  10. # Unless required by applicable law or agreed to in writing, software
  11. # distributed under the License is distributed on an "AS IS" BASIS,
  12. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. # See the License for the specific language governing permissions and
  14. # limitations under the License.
  15. import heapq
  16. import logging
  17. from collections import namedtuple
  18. from typing import (
  19. TYPE_CHECKING,
  20. Any,
  21. Awaitable,
  22. Callable,
  23. List,
  24. Optional,
  25. Tuple,
  26. TypeVar,
  27. )
  28. import attr
  29. from synapse.replication.http.streams import ReplicationGetStreamUpdates
  30. if TYPE_CHECKING:
  31. from synapse.server import HomeServer
logger = logging.getLogger(__name__)

# The number of rows to request from an update_function. It is passed as the
# `target_row_count` argument by `Stream.get_updates_since`.
_STREAM_UPDATE_TARGET_ROW_COUNT = 100


# Some type aliases to make things a bit easier.

# A stream position token
Token = int

# The type of a stream update row, after JSON deserialisation, but before
# parsing with Stream.parse_row (which turns it into a `ROW_TYPE`). Normally it's
# just a row from a database query, though this is dependent on the stream in question.
#
StreamRow = TypeVar("StreamRow", bound=Tuple)

# The type returned by the update_function of a stream, as well as get_updates(),
# get_updates_since, etc.
#
# It consists of a triplet `(updates, new_last_token, limited)`, where:
#   * `updates` is a list of `(token, row)` entries.
#   * `new_last_token` is the new position in stream.
#   * `limited` is whether there are more updates to fetch.
#
StreamUpdateResult = Tuple[List[Tuple[Token, StreamRow]], Token, bool]

# The type of an update_function for a stream
#
# The arguments are:
#
#  * instance_name: the writer of the stream
#  * from_token: the previous stream token: the starting point for fetching the
#    updates
#  * to_token: the new stream token: the point to get updates up to
#  * target_row_count: a target for the number of rows to be returned.
#
# The update_function is expected to return up to _approximately_ target_row_count rows.
# If there are more updates available, it should set `limited` in the result, and
# it will be called again to get the next batch.
#
UpdateFunction = Callable[[str, Token, Token, int], Awaitable[StreamUpdateResult]]
  67. class Stream:
  68. """Base class for the streams.
  69. Provides a `get_updates()` function that returns new updates since the last
  70. time it was called.
  71. """
  72. NAME: str # The name of the stream
  73. # The type of the row. Used by the default impl of parse_row.
  74. ROW_TYPE: Any = None
  75. @classmethod
  76. def parse_row(cls, row: StreamRow):
  77. """Parse a row received over replication
  78. By default, assumes that the row data is an array object and passes its contents
  79. to the constructor of the ROW_TYPE for this stream.
  80. Args:
  81. row: row data from the incoming RDATA command, after json decoding
  82. Returns:
  83. ROW_TYPE object for this stream
  84. """
  85. return cls.ROW_TYPE(*row)
  86. def __init__(
  87. self,
  88. local_instance_name: str,
  89. current_token_function: Callable[[str], Token],
  90. update_function: UpdateFunction,
  91. ):
  92. """Instantiate a Stream
  93. `current_token_function` and `update_function` are callbacks which
  94. should be implemented by subclasses.
  95. `current_token_function` takes an instance name, which is a writer to
  96. the stream, and returns the position in the stream of the writer (as
  97. viewed from the current process). On the writer process this is where
  98. the writer has successfully written up to, whereas on other processes
  99. this is the position which we have received updates up to over
  100. replication. (Note that most streams have a single writer and so their
  101. implementations ignore the instance name passed in).
  102. `update_function` is called to get updates for this stream between a
  103. pair of stream tokens. See the `UpdateFunction` type definition for more
  104. info.
  105. Args:
  106. local_instance_name: The instance name of the current process
  107. current_token_function: callback to get the current token, as above
  108. update_function: callback go get stream updates, as above
  109. """
  110. self.local_instance_name = local_instance_name
  111. self.current_token = current_token_function
  112. self.update_function = update_function
  113. # The token from which we last asked for updates
  114. self.last_token = self.current_token(self.local_instance_name)
  115. def discard_updates_and_advance(self):
  116. """Called when the stream should advance but the updates would be discarded,
  117. e.g. when there are no currently connected workers.
  118. """
  119. self.last_token = self.current_token(self.local_instance_name)
  120. async def get_updates(self) -> StreamUpdateResult:
  121. """Gets all updates since the last time this function was called (or
  122. since the stream was constructed if it hadn't been called before).
  123. Returns:
  124. A triplet `(updates, new_last_token, limited)`, where `updates` is
  125. a list of `(token, row)` entries, `new_last_token` is the new
  126. position in stream, and `limited` is whether there are more updates
  127. to fetch.
  128. """
  129. current_token = self.current_token(self.local_instance_name)
  130. updates, current_token, limited = await self.get_updates_since(
  131. self.local_instance_name, self.last_token, current_token
  132. )
  133. self.last_token = current_token
  134. return updates, current_token, limited
  135. async def get_updates_since(
  136. self, instance_name: str, from_token: Token, upto_token: Token
  137. ) -> StreamUpdateResult:
  138. """Like get_updates except allows specifying from when we should
  139. stream updates
  140. Returns:
  141. A triplet `(updates, new_last_token, limited)`, where `updates` is
  142. a list of `(token, row)` entries, `new_last_token` is the new
  143. position in stream, and `limited` is whether there are more updates
  144. to fetch.
  145. """
  146. from_token = int(from_token)
  147. if from_token == upto_token:
  148. return [], upto_token, False
  149. updates, upto_token, limited = await self.update_function(
  150. instance_name,
  151. from_token,
  152. upto_token,
  153. _STREAM_UPDATE_TARGET_ROW_COUNT,
  154. )
  155. return updates, upto_token, limited
  156. def current_token_without_instance(
  157. current_token: Callable[[], int]
  158. ) -> Callable[[str], int]:
  159. """Takes a current token callback function for a single writer stream
  160. that doesn't take an instance name parameter and wraps it in a function that
  161. does accept an instance name parameter but ignores it.
  162. """
  163. return lambda instance_name: current_token()
  164. def make_http_update_function(hs, stream_name: str) -> UpdateFunction:
  165. """Makes a suitable function for use as an `update_function` that queries
  166. the master process for updates.
  167. """
  168. client = ReplicationGetStreamUpdates.make_client(hs)
  169. async def update_function(
  170. instance_name: str, from_token: int, upto_token: int, limit: int
  171. ) -> StreamUpdateResult:
  172. result = await client(
  173. instance_name=instance_name,
  174. stream_name=stream_name,
  175. from_token=from_token,
  176. upto_token=upto_token,
  177. )
  178. return result["updates"], result["upto_token"], result["limited"]
  179. return update_function
  180. class BackfillStream(Stream):
  181. """We fetched some old events and either we had never seen that event before
  182. or it went from being an outlier to not.
  183. """
  184. BackfillStreamRow = namedtuple(
  185. "BackfillStreamRow",
  186. (
  187. "event_id", # str
  188. "room_id", # str
  189. "type", # str
  190. "state_key", # str, optional
  191. "redacts", # str, optional
  192. "relates_to", # str, optional
  193. ),
  194. )
  195. NAME = "backfill"
  196. ROW_TYPE = BackfillStreamRow
  197. def __init__(self, hs: "HomeServer"):
  198. self.store = hs.get_datastore()
  199. super().__init__(
  200. hs.get_instance_name(),
  201. self._current_token,
  202. self.store.get_all_new_backfill_event_rows,
  203. )
  204. def _current_token(self, instance_name: str) -> int:
  205. # The backfill stream over replication operates on *positive* numbers,
  206. # which means we need to negate it.
  207. return -self.store._backfill_id_gen.get_current_token_for_writer(instance_name)
  208. class PresenceStream(Stream):
  209. PresenceStreamRow = namedtuple(
  210. "PresenceStreamRow",
  211. (
  212. "user_id", # str
  213. "state", # str
  214. "last_active_ts", # int
  215. "last_federation_update_ts", # int
  216. "last_user_sync_ts", # int
  217. "status_msg", # str
  218. "currently_active", # bool
  219. ),
  220. )
  221. NAME = "presence"
  222. ROW_TYPE = PresenceStreamRow
  223. def __init__(self, hs: "HomeServer"):
  224. store = hs.get_datastore()
  225. if hs.get_instance_name() in hs.config.worker.writers.presence:
  226. # on the presence writer, query the presence handler
  227. presence_handler = hs.get_presence_handler()
  228. from synapse.handlers.presence import PresenceHandler
  229. assert isinstance(presence_handler, PresenceHandler)
  230. update_function: UpdateFunction = presence_handler.get_all_presence_updates
  231. else:
  232. # Query presence writer process
  233. update_function = make_http_update_function(hs, self.NAME)
  234. super().__init__(
  235. hs.get_instance_name(),
  236. current_token_without_instance(store.get_current_presence_token),
  237. update_function,
  238. )
  239. class PresenceFederationStream(Stream):
  240. """A stream used to send ad hoc presence updates over federation.
  241. Streams the remote destination and the user ID of the presence state to
  242. send.
  243. """
  244. @attr.s(slots=True, auto_attribs=True)
  245. class PresenceFederationStreamRow:
  246. destination: str
  247. user_id: str
  248. NAME = "presence_federation"
  249. ROW_TYPE = PresenceFederationStreamRow
  250. def __init__(self, hs: "HomeServer"):
  251. federation_queue = hs.get_presence_handler().get_federation_queue()
  252. super().__init__(
  253. hs.get_instance_name(),
  254. federation_queue.get_current_token,
  255. federation_queue.get_replication_rows,
  256. )
  257. class TypingStream(Stream):
  258. TypingStreamRow = namedtuple(
  259. "TypingStreamRow", ("room_id", "user_ids") # str # list(str)
  260. )
  261. NAME = "typing"
  262. ROW_TYPE = TypingStreamRow
  263. def __init__(self, hs: "HomeServer"):
  264. if hs.get_instance_name() in hs.config.worker.writers.typing:
  265. # On the writer, query the typing handler
  266. typing_writer_handler = hs.get_typing_writer_handler()
  267. update_function: Callable[
  268. [str, int, int, int], Awaitable[Tuple[List[Tuple[int, Any]], int, bool]]
  269. ] = typing_writer_handler.get_all_typing_updates
  270. current_token_function = typing_writer_handler.get_current_token
  271. else:
  272. # Query the typing writer process
  273. update_function = make_http_update_function(hs, self.NAME)
  274. current_token_function = hs.get_typing_handler().get_current_token
  275. super().__init__(
  276. hs.get_instance_name(),
  277. current_token_without_instance(current_token_function),
  278. update_function,
  279. )
  280. class ReceiptsStream(Stream):
  281. ReceiptsStreamRow = namedtuple(
  282. "ReceiptsStreamRow",
  283. (
  284. "room_id", # str
  285. "receipt_type", # str
  286. "user_id", # str
  287. "event_id", # str
  288. "data", # dict
  289. ),
  290. )
  291. NAME = "receipts"
  292. ROW_TYPE = ReceiptsStreamRow
  293. def __init__(self, hs: "HomeServer"):
  294. store = hs.get_datastore()
  295. super().__init__(
  296. hs.get_instance_name(),
  297. current_token_without_instance(store.get_max_receipt_stream_id),
  298. store.get_all_updated_receipts,
  299. )
  300. class PushRulesStream(Stream):
  301. """A user has changed their push rules"""
  302. PushRulesStreamRow = namedtuple("PushRulesStreamRow", ("user_id",)) # str
  303. NAME = "push_rules"
  304. ROW_TYPE = PushRulesStreamRow
  305. def __init__(self, hs: "HomeServer"):
  306. self.store = hs.get_datastore()
  307. super().__init__(
  308. hs.get_instance_name(),
  309. self._current_token,
  310. self.store.get_all_push_rule_updates,
  311. )
  312. def _current_token(self, instance_name: str) -> int:
  313. push_rules_token = self.store.get_max_push_rules_stream_id()
  314. return push_rules_token
  315. class PushersStream(Stream):
  316. """A user has added/changed/removed a pusher"""
  317. PushersStreamRow = namedtuple(
  318. "PushersStreamRow",
  319. ("user_id", "app_id", "pushkey", "deleted"), # str # str # str # bool
  320. )
  321. NAME = "pushers"
  322. ROW_TYPE = PushersStreamRow
  323. def __init__(self, hs: "HomeServer"):
  324. store = hs.get_datastore()
  325. super().__init__(
  326. hs.get_instance_name(),
  327. current_token_without_instance(store.get_pushers_stream_token),
  328. store.get_all_updated_pushers_rows,
  329. )
  330. class CachesStream(Stream):
  331. """A cache was invalidated on the master and no other stream would invalidate
  332. the cache on the workers
  333. """
  334. @attr.s(slots=True)
  335. class CachesStreamRow:
  336. """Stream to inform workers they should invalidate their cache.
  337. Attributes:
  338. cache_func: Name of the cached function.
  339. keys: The entry in the cache to invalidate. If None then will
  340. invalidate all.
  341. invalidation_ts: Timestamp of when the invalidation took place.
  342. """
  343. cache_func = attr.ib(type=str)
  344. keys = attr.ib(type=Optional[List[Any]])
  345. invalidation_ts = attr.ib(type=int)
  346. NAME = "caches"
  347. ROW_TYPE = CachesStreamRow
  348. def __init__(self, hs: "HomeServer"):
  349. store = hs.get_datastore()
  350. super().__init__(
  351. hs.get_instance_name(),
  352. store.get_cache_stream_token_for_writer,
  353. store.get_all_updated_caches,
  354. )
  355. class DeviceListsStream(Stream):
  356. """Either a user has updated their devices or a remote server needs to be
  357. told about a device update.
  358. """
  359. @attr.s(slots=True)
  360. class DeviceListsStreamRow:
  361. entity = attr.ib(type=str)
  362. NAME = "device_lists"
  363. ROW_TYPE = DeviceListsStreamRow
  364. def __init__(self, hs: "HomeServer"):
  365. store = hs.get_datastore()
  366. super().__init__(
  367. hs.get_instance_name(),
  368. current_token_without_instance(store.get_device_stream_token),
  369. store.get_all_device_list_changes_for_remotes,
  370. )
  371. class ToDeviceStream(Stream):
  372. """New to_device messages for a client"""
  373. ToDeviceStreamRow = namedtuple("ToDeviceStreamRow", ("entity",)) # str
  374. NAME = "to_device"
  375. ROW_TYPE = ToDeviceStreamRow
  376. def __init__(self, hs: "HomeServer"):
  377. store = hs.get_datastore()
  378. super().__init__(
  379. hs.get_instance_name(),
  380. current_token_without_instance(store.get_to_device_stream_token),
  381. store.get_all_new_device_messages,
  382. )
  383. class TagAccountDataStream(Stream):
  384. """Someone added/removed a tag for a room"""
  385. TagAccountDataStreamRow = namedtuple(
  386. "TagAccountDataStreamRow", ("user_id", "room_id", "data") # str # str # dict
  387. )
  388. NAME = "tag_account_data"
  389. ROW_TYPE = TagAccountDataStreamRow
  390. def __init__(self, hs: "HomeServer"):
  391. store = hs.get_datastore()
  392. super().__init__(
  393. hs.get_instance_name(),
  394. current_token_without_instance(store.get_max_account_data_stream_id),
  395. store.get_all_updated_tags,
  396. )
  397. class AccountDataStream(Stream):
  398. """Global or per room account data was changed"""
  399. AccountDataStreamRow = namedtuple(
  400. "AccountDataStreamRow",
  401. ("user_id", "room_id", "data_type"), # str # Optional[str] # str
  402. )
  403. NAME = "account_data"
  404. ROW_TYPE = AccountDataStreamRow
  405. def __init__(self, hs: "HomeServer"):
  406. self.store = hs.get_datastore()
  407. super().__init__(
  408. hs.get_instance_name(),
  409. current_token_without_instance(self.store.get_max_account_data_stream_id),
  410. self._update_function,
  411. )
  412. async def _update_function(
  413. self, instance_name: str, from_token: int, to_token: int, limit: int
  414. ) -> StreamUpdateResult:
  415. limited = False
  416. global_results = await self.store.get_updated_global_account_data(
  417. from_token, to_token, limit
  418. )
  419. # if the global results hit the limit, we'll need to limit the room results to
  420. # the same stream token.
  421. if len(global_results) >= limit:
  422. to_token = global_results[-1][0]
  423. limited = True
  424. room_results = await self.store.get_updated_room_account_data(
  425. from_token, to_token, limit
  426. )
  427. # likewise, if the room results hit the limit, limit the global results to
  428. # the same stream token.
  429. if len(room_results) >= limit:
  430. to_token = room_results[-1][0]
  431. limited = True
  432. # convert the global results to the right format, and limit them to the to_token
  433. # at the same time
  434. global_rows = (
  435. (stream_id, (user_id, None, account_data_type))
  436. for stream_id, user_id, account_data_type in global_results
  437. if stream_id <= to_token
  438. )
  439. # we know that the room_results are already limited to `to_token` so no need
  440. # for a check on `stream_id` here.
  441. room_rows = (
  442. (stream_id, (user_id, room_id, account_data_type))
  443. for stream_id, user_id, room_id, account_data_type in room_results
  444. )
  445. # We need to return a sorted list, so merge them together.
  446. #
  447. # Note: We order only by the stream ID to work around a bug where the
  448. # same stream ID could appear in both `global_rows` and `room_rows`,
  449. # leading to a comparison between the data tuples. The comparison could
  450. # fail due to attempting to compare the `room_id` which results in a
  451. # `TypeError` from comparing a `str` vs `None`.
  452. updates = list(heapq.merge(room_rows, global_rows, key=lambda row: row[0]))
  453. return updates, to_token, limited
  454. class GroupServerStream(Stream):
  455. GroupsStreamRow = namedtuple(
  456. "GroupsStreamRow",
  457. ("group_id", "user_id", "type", "content"), # str # str # str # dict
  458. )
  459. NAME = "groups"
  460. ROW_TYPE = GroupsStreamRow
  461. def __init__(self, hs: "HomeServer"):
  462. store = hs.get_datastore()
  463. super().__init__(
  464. hs.get_instance_name(),
  465. current_token_without_instance(store.get_group_stream_token),
  466. store.get_all_groups_changes,
  467. )
  468. class UserSignatureStream(Stream):
  469. """A user has signed their own device with their user-signing key"""
  470. UserSignatureStreamRow = namedtuple("UserSignatureStreamRow", ("user_id")) # str
  471. NAME = "user_signature"
  472. ROW_TYPE = UserSignatureStreamRow
  473. def __init__(self, hs: "HomeServer"):
  474. store = hs.get_datastore()
  475. super().__init__(
  476. hs.get_instance_name(),
  477. current_token_without_instance(store.get_device_stream_token),
  478. store.get_all_user_signature_changes_for_remotes,
  479. )