metrics.py 21 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532
  1. # Copyright 2020 The Matrix.org Foundation C.I.C.
  2. #
  3. # Licensed under the Apache License, Version 2.0 (the "License");
  4. # you may not use this file except in compliance with the License.
  5. # You may obtain a copy of the License at
  6. #
  7. # http://www.apache.org/licenses/LICENSE-2.0
  8. #
  9. # Unless required by applicable law or agreed to in writing, software
  10. # distributed under the License is distributed on an "AS IS" BASIS,
  11. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. # See the License for the specific language governing permissions and
  13. # limitations under the License.
  14. import calendar
  15. import logging
  16. import time
  17. from typing import TYPE_CHECKING, Dict, List, Tuple, cast
  18. from synapse.metrics import GaugeBucketCollector
  19. from synapse.metrics.background_process_metrics import wrap_as_background_process
  20. from synapse.storage._base import SQLBaseStore
  21. from synapse.storage.database import (
  22. DatabasePool,
  23. LoggingDatabaseConnection,
  24. LoggingTransaction,
  25. )
  26. from synapse.storage.databases.main.event_push_actions import (
  27. EventPushActionsWorkerStore,
  28. )
  29. if TYPE_CHECKING:
  30. from synapse.server import HomeServer
  31. logger = logging.getLogger(__name__)
  32. # Collect metrics on the number of forward extremities that exist.
  33. _extremities_collecter = GaugeBucketCollector(
  34. "synapse_forward_extremities",
  35. "Number of rooms on the server with the given number of forward extremities"
  36. " or fewer",
  37. buckets=[1, 2, 3, 5, 7, 10, 15, 20, 50, 100, 200, 500],
  38. )
  39. # we also expose metrics on the "number of excess extremity events", which is
  40. # (E-1)*N, where E is the number of extremities and N is the number of state
  41. # events in the room. This is an approximation to the number of state events
  42. # we could remove from state resolution by reducing the graph to a single
  43. # forward extremity.
  44. _excess_state_events_collecter = GaugeBucketCollector(
  45. "synapse_excess_extremity_events",
  46. "Number of rooms on the server with the given number of excess extremity "
  47. "events, or fewer",
  48. buckets=[0] + [1 << n for n in range(12)],
  49. )
  50. class ServerMetricsStore(EventPushActionsWorkerStore, SQLBaseStore):
  51. """Functions to pull various metrics from the DB, for e.g. phone home
  52. stats and prometheus metrics.
  53. """
  54. def __init__(
  55. self,
  56. database: DatabasePool,
  57. db_conn: LoggingDatabaseConnection,
  58. hs: "HomeServer",
  59. ):
  60. super().__init__(database, db_conn, hs)
  61. # Read the extrems every 60 minutes
  62. if hs.config.worker.run_background_tasks:
  63. self._clock.looping_call(self._read_forward_extremities, 60 * 60 * 1000)
  64. # Used in _generate_user_daily_visits to keep track of progress
  65. self._last_user_visit_update = self._get_start_of_day()
  66. @wrap_as_background_process("read_forward_extremities")
  67. async def _read_forward_extremities(self) -> None:
  68. def fetch(txn: LoggingTransaction) -> List[Tuple[int, int]]:
  69. txn.execute(
  70. """
  71. SELECT t1.c, t2.c
  72. FROM (
  73. SELECT room_id, COUNT(*) c FROM event_forward_extremities
  74. GROUP BY room_id
  75. ) t1 LEFT JOIN (
  76. SELECT room_id, COUNT(*) c FROM current_state_events
  77. GROUP BY room_id
  78. ) t2 ON t1.room_id = t2.room_id
  79. """
  80. )
  81. return cast(List[Tuple[int, int]], txn.fetchall())
  82. res = await self.db_pool.runInteraction("read_forward_extremities", fetch)
  83. _extremities_collecter.update_data(x[0] for x in res)
  84. _excess_state_events_collecter.update_data(
  85. (x[0] - 1) * x[1] for x in res if x[1]
  86. )
  87. async def count_daily_e2ee_messages(self) -> int:
  88. """
  89. Returns an estimate of the number of messages sent in the last day.
  90. If it has been significantly less or more than one day since the last
  91. call to this function, it will return None.
  92. """
  93. def _count_messages(txn: LoggingTransaction) -> int:
  94. sql = """
  95. SELECT COUNT(*) FROM events
  96. WHERE type = 'm.room.encrypted'
  97. AND stream_ordering > ?
  98. """
  99. txn.execute(sql, (self.stream_ordering_day_ago,))
  100. (count,) = cast(Tuple[int], txn.fetchone())
  101. return count
  102. return await self.db_pool.runInteraction("count_e2ee_messages", _count_messages)
  103. async def count_daily_sent_e2ee_messages(self) -> int:
  104. def _count_messages(txn: LoggingTransaction) -> int:
  105. # This is good enough as if you have silly characters in your own
  106. # hostname then that's your own fault.
  107. like_clause = "%:" + self.hs.hostname
  108. sql = """
  109. SELECT COUNT(*) FROM events
  110. WHERE type = 'm.room.encrypted'
  111. AND sender LIKE ?
  112. AND stream_ordering > ?
  113. """
  114. txn.execute(sql, (like_clause, self.stream_ordering_day_ago))
  115. (count,) = cast(Tuple[int], txn.fetchone())
  116. return count
  117. return await self.db_pool.runInteraction(
  118. "count_daily_sent_e2ee_messages", _count_messages
  119. )
  120. async def count_daily_active_e2ee_rooms(self) -> int:
  121. def _count(txn: LoggingTransaction) -> int:
  122. sql = """
  123. SELECT COUNT(DISTINCT room_id) FROM events
  124. WHERE type = 'm.room.encrypted'
  125. AND stream_ordering > ?
  126. """
  127. txn.execute(sql, (self.stream_ordering_day_ago,))
  128. (count,) = cast(Tuple[int], txn.fetchone())
  129. return count
  130. return await self.db_pool.runInteraction(
  131. "count_daily_active_e2ee_rooms", _count
  132. )
  133. async def count_daily_messages(self) -> int:
  134. """
  135. Returns an estimate of the number of messages sent in the last day.
  136. If it has been significantly less or more than one day since the last
  137. call to this function, it will return None.
  138. """
  139. def _count_messages(txn: LoggingTransaction) -> int:
  140. sql = """
  141. SELECT COUNT(*) FROM events
  142. WHERE type = 'm.room.message'
  143. AND stream_ordering > ?
  144. """
  145. txn.execute(sql, (self.stream_ordering_day_ago,))
  146. (count,) = cast(Tuple[int], txn.fetchone())
  147. return count
  148. return await self.db_pool.runInteraction("count_messages", _count_messages)
  149. async def count_daily_sent_messages(self) -> int:
  150. def _count_messages(txn: LoggingTransaction) -> int:
  151. # This is good enough as if you have silly characters in your own
  152. # hostname then that's your own fault.
  153. like_clause = "%:" + self.hs.hostname
  154. sql = """
  155. SELECT COUNT(*) FROM events
  156. WHERE type = 'm.room.message'
  157. AND sender LIKE ?
  158. AND stream_ordering > ?
  159. """
  160. txn.execute(sql, (like_clause, self.stream_ordering_day_ago))
  161. (count,) = cast(Tuple[int], txn.fetchone())
  162. return count
  163. return await self.db_pool.runInteraction(
  164. "count_daily_sent_messages", _count_messages
  165. )
  166. async def count_daily_active_rooms(self) -> int:
  167. def _count(txn: LoggingTransaction) -> int:
  168. sql = """
  169. SELECT COUNT(DISTINCT room_id) FROM events
  170. WHERE type = 'm.room.message'
  171. AND stream_ordering > ?
  172. """
  173. txn.execute(sql, (self.stream_ordering_day_ago,))
  174. (count,) = cast(Tuple[int], txn.fetchone())
  175. return count
  176. return await self.db_pool.runInteraction("count_daily_active_rooms", _count)
  177. async def count_daily_users(self) -> int:
  178. """
  179. Counts the number of users who used this homeserver in the last 24 hours.
  180. """
  181. yesterday = int(self._clock.time_msec()) - (1000 * 60 * 60 * 24)
  182. return await self.db_pool.runInteraction(
  183. "count_daily_users", self._count_users, yesterday
  184. )
  185. async def count_monthly_users(self) -> int:
  186. """
  187. Counts the number of users who used this homeserver in the last 30 days.
  188. Note this method is intended for phonehome metrics only and is different
  189. from the mau figure in synapse.storage.monthly_active_users which,
  190. amongst other things, includes a 3 day grace period before a user counts.
  191. """
  192. thirty_days_ago = int(self._clock.time_msec()) - (1000 * 60 * 60 * 24 * 30)
  193. return await self.db_pool.runInteraction(
  194. "count_monthly_users", self._count_users, thirty_days_ago
  195. )
  196. def _count_users(self, txn: LoggingTransaction, time_from: int) -> int:
  197. """
  198. Returns number of users seen in the past time_from period
  199. """
  200. sql = """
  201. SELECT COUNT(*) FROM (
  202. SELECT user_id FROM user_ips
  203. WHERE last_seen > ?
  204. GROUP BY user_id
  205. ) u
  206. """
  207. txn.execute(sql, (time_from,))
  208. # Mypy knows that fetchone() might return None if there are no rows.
  209. # We know better: "SELECT COUNT(...) FROM ..." without any GROUP BY always
  210. # returns exactly one row.
  211. (count,) = cast(Tuple[int], txn.fetchone())
  212. return count
  213. async def count_r30_users(self) -> Dict[str, int]:
  214. """
  215. Counts the number of 30 day retained users, defined as:-
  216. * Users who have created their accounts more than 30 days ago
  217. * Where last seen at most 30 days ago
  218. * Where account creation and last_seen are > 30 days apart
  219. Returns:
  220. A mapping of counts globally as well as broken out by platform.
  221. """
  222. def _count_r30_users(txn: LoggingTransaction) -> Dict[str, int]:
  223. thirty_days_in_secs = 86400 * 30
  224. now = int(self._clock.time())
  225. thirty_days_ago_in_secs = now - thirty_days_in_secs
  226. sql = """
  227. SELECT platform, COUNT(*) FROM (
  228. SELECT
  229. users.name, platform, users.creation_ts * 1000,
  230. MAX(uip.last_seen)
  231. FROM users
  232. INNER JOIN (
  233. SELECT
  234. user_id,
  235. last_seen,
  236. CASE
  237. WHEN user_agent LIKE '%%Android%%' THEN 'android'
  238. WHEN user_agent LIKE '%%iOS%%' THEN 'ios'
  239. WHEN user_agent LIKE '%%Electron%%' THEN 'electron'
  240. WHEN user_agent LIKE '%%Mozilla%%' THEN 'web'
  241. WHEN user_agent LIKE '%%Gecko%%' THEN 'web'
  242. ELSE 'unknown'
  243. END
  244. AS platform
  245. FROM user_ips
  246. ) uip
  247. ON users.name = uip.user_id
  248. AND users.appservice_id is NULL
  249. AND users.creation_ts < ?
  250. AND uip.last_seen/1000 > ?
  251. AND (uip.last_seen/1000) - users.creation_ts > 86400 * 30
  252. GROUP BY users.name, platform, users.creation_ts
  253. ) u GROUP BY platform
  254. """
  255. results = {}
  256. txn.execute(sql, (thirty_days_ago_in_secs, thirty_days_ago_in_secs))
  257. for row in txn:
  258. if row[0] == "unknown":
  259. pass
  260. results[row[0]] = row[1]
  261. sql = """
  262. SELECT COUNT(*) FROM (
  263. SELECT users.name, users.creation_ts * 1000,
  264. MAX(uip.last_seen)
  265. FROM users
  266. INNER JOIN (
  267. SELECT
  268. user_id,
  269. last_seen
  270. FROM user_ips
  271. ) uip
  272. ON users.name = uip.user_id
  273. AND appservice_id is NULL
  274. AND users.creation_ts < ?
  275. AND uip.last_seen/1000 > ?
  276. AND (uip.last_seen/1000) - users.creation_ts > 86400 * 30
  277. GROUP BY users.name, users.creation_ts
  278. ) u
  279. """
  280. txn.execute(sql, (thirty_days_ago_in_secs, thirty_days_ago_in_secs))
  281. (count,) = cast(Tuple[int], txn.fetchone())
  282. results["all"] = count
  283. return results
  284. return await self.db_pool.runInteraction("count_r30_users", _count_r30_users)
  285. async def count_r30v2_users(self) -> Dict[str, int]:
  286. """
  287. Counts the number of 30 day retained users, defined as users that:
  288. - Appear more than once in the past 60 days
  289. - Have more than 30 days between the most and least recent appearances that
  290. occurred in the past 60 days.
  291. (This is the second version of this metric, hence R30'v2')
  292. Returns:
  293. A mapping from client type to the number of 30-day retained users for that client.
  294. The dict keys are:
  295. - "all" (a combined number of users across any and all clients)
  296. - "android" (Element Android)
  297. - "ios" (Element iOS)
  298. - "electron" (Element Desktop)
  299. - "web" (any web application -- it's not possible to distinguish Element Web here)
  300. """
  301. def _count_r30v2_users(txn: LoggingTransaction) -> Dict[str, int]:
  302. thirty_days_in_secs = 86400 * 30
  303. now = int(self._clock.time())
  304. sixty_days_ago_in_secs = now - 2 * thirty_days_in_secs
  305. one_day_from_now_in_secs = now + 86400
  306. # This is the 'per-platform' count.
  307. sql = """
  308. SELECT
  309. client_type,
  310. count(client_type)
  311. FROM
  312. (
  313. SELECT
  314. user_id,
  315. CASE
  316. WHEN
  317. LOWER(user_agent) LIKE '%%riot%%' OR
  318. LOWER(user_agent) LIKE '%%element%%'
  319. THEN CASE
  320. WHEN
  321. LOWER(user_agent) LIKE '%%electron%%'
  322. THEN 'electron'
  323. WHEN
  324. LOWER(user_agent) LIKE '%%android%%'
  325. THEN 'android'
  326. WHEN
  327. LOWER(user_agent) LIKE '%%ios%%'
  328. THEN 'ios'
  329. ELSE 'unknown'
  330. END
  331. WHEN
  332. LOWER(user_agent) LIKE '%%mozilla%%' OR
  333. LOWER(user_agent) LIKE '%%gecko%%'
  334. THEN 'web'
  335. ELSE 'unknown'
  336. END as client_type
  337. FROM
  338. user_daily_visits
  339. WHERE
  340. timestamp > ?
  341. AND
  342. timestamp < ?
  343. GROUP BY
  344. user_id,
  345. client_type
  346. HAVING
  347. max(timestamp) - min(timestamp) > ?
  348. ) AS temp
  349. GROUP BY
  350. client_type
  351. ;
  352. """
  353. # We initialise all the client types to zero, so we get an explicit
  354. # zero if they don't appear in the query results
  355. results = {"ios": 0, "android": 0, "web": 0, "electron": 0}
  356. txn.execute(
  357. sql,
  358. (
  359. sixty_days_ago_in_secs * 1000,
  360. one_day_from_now_in_secs * 1000,
  361. thirty_days_in_secs * 1000,
  362. ),
  363. )
  364. for row in txn:
  365. if row[0] == "unknown":
  366. continue
  367. results[row[0]] = row[1]
  368. # This is the 'all users' count.
  369. sql = """
  370. SELECT COUNT(*) FROM (
  371. SELECT
  372. 1
  373. FROM
  374. user_daily_visits
  375. WHERE
  376. timestamp > ?
  377. AND
  378. timestamp < ?
  379. GROUP BY
  380. user_id
  381. HAVING
  382. max(timestamp) - min(timestamp) > ?
  383. ) AS r30_users
  384. """
  385. txn.execute(
  386. sql,
  387. (
  388. sixty_days_ago_in_secs * 1000,
  389. one_day_from_now_in_secs * 1000,
  390. thirty_days_in_secs * 1000,
  391. ),
  392. )
  393. (count,) = cast(Tuple[int], txn.fetchone())
  394. results["all"] = count
  395. return results
  396. return await self.db_pool.runInteraction(
  397. "count_r30v2_users", _count_r30v2_users
  398. )
  399. def _get_start_of_day(self) -> int:
  400. """
  401. Returns millisecond unixtime for start of UTC day.
  402. """
  403. now = time.gmtime(self._clock.time())
  404. today_start = calendar.timegm((now.tm_year, now.tm_mon, now.tm_mday, 0, 0, 0))
  405. return today_start * 1000
  406. @wrap_as_background_process("generate_user_daily_visits")
  407. async def generate_user_daily_visits(self) -> None:
  408. """
  409. Generates daily visit data for use in cohort/ retention analysis
  410. """
  411. def _generate_user_daily_visits(txn: LoggingTransaction) -> None:
  412. logger.info("Calling _generate_user_daily_visits")
  413. today_start = self._get_start_of_day()
  414. a_day_in_milliseconds = 24 * 60 * 60 * 1000
  415. now = self._clock.time_msec()
  416. # A note on user_agent. Technically a given device can have multiple
  417. # user agents, so we need to decide which one to pick. We could have
  418. # handled this in number of ways, but given that we don't care
  419. # _that_ much we have gone for MAX(). For more details of the other
  420. # options considered see
  421. # https://github.com/matrix-org/synapse/pull/8503#discussion_r502306111
  422. sql = """
  423. INSERT INTO user_daily_visits (user_id, device_id, timestamp, user_agent)
  424. SELECT u.user_id, u.device_id, ?, MAX(u.user_agent)
  425. FROM user_ips AS u
  426. LEFT JOIN (
  427. SELECT user_id, device_id, timestamp FROM user_daily_visits
  428. WHERE timestamp = ?
  429. ) udv
  430. ON u.user_id = udv.user_id AND u.device_id=udv.device_id
  431. INNER JOIN users ON users.name=u.user_id
  432. WHERE ? <= last_seen AND last_seen < ?
  433. AND udv.timestamp IS NULL AND users.is_guest=0
  434. AND users.appservice_id IS NULL
  435. GROUP BY u.user_id, u.device_id
  436. """
  437. # This means that the day has rolled over but there could still
  438. # be entries from the previous day. There is an edge case
  439. # where if the user logs in at 23:59 and overwrites their
  440. # last_seen at 00:01 then they will not be counted in the
  441. # previous day's stats - it is important that the query is run
  442. # often to minimise this case.
  443. if today_start > self._last_user_visit_update:
  444. yesterday_start = today_start - a_day_in_milliseconds
  445. txn.execute(
  446. sql,
  447. (
  448. yesterday_start,
  449. yesterday_start,
  450. self._last_user_visit_update,
  451. today_start,
  452. ),
  453. )
  454. self._last_user_visit_update = today_start
  455. txn.execute(
  456. sql, (today_start, today_start, self._last_user_visit_update, now)
  457. )
  458. # Update _last_user_visit_update to now. The reason to do this
  459. # rather just clamping to the beginning of the day is to limit
  460. # the size of the join - meaning that the query can be run more
  461. # frequently
  462. self._last_user_visit_update = now
  463. await self.db_pool.runInteraction(
  464. "generate_user_daily_visits", _generate_user_daily_visits
  465. )