request_metrics.py 7.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243
  1. # -*- coding: utf-8 -*-
  2. # Copyright 2014-2016 OpenMarket Ltd
  3. # Copyright 2018 New Vector Ltd
  4. #
  5. # Licensed under the Apache License, Version 2.0 (the "License");
  6. # you may not use this file except in compliance with the License.
  7. # You may obtain a copy of the License at
  8. #
  9. # http://www.apache.org/licenses/LICENSE-2.0
  10. #
  11. # Unless required by applicable law or agreed to in writing, software
  12. # distributed under the License is distributed on an "AS IS" BASIS,
  13. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  14. # See the License for the specific language governing permissions and
  15. # limitations under the License.
  16. import logging
  17. import threading
  18. from prometheus_client.core import Counter, Histogram
  19. from synapse.metrics import LaterGauge
  20. from synapse.util.logcontext import LoggingContext
  21. logger = logging.getLogger(__name__)
  22. # total number of responses served, split by method/servlet/tag
  23. response_count = Counter(
  24. "synapse_http_server_response_count", "", ["method", "servlet", "tag"]
  25. )
  26. requests_counter = Counter(
  27. "synapse_http_server_requests_received", "", ["method", "servlet"]
  28. )
  29. outgoing_responses_counter = Counter(
  30. "synapse_http_server_responses", "", ["method", "code"]
  31. )
  32. response_timer = Histogram(
  33. "synapse_http_server_response_time_seconds",
  34. "sec",
  35. ["method", "servlet", "tag", "code"],
  36. )
  37. response_ru_utime = Counter(
  38. "synapse_http_server_response_ru_utime_seconds", "sec", ["method", "servlet", "tag"]
  39. )
  40. response_ru_stime = Counter(
  41. "synapse_http_server_response_ru_stime_seconds", "sec", ["method", "servlet", "tag"]
  42. )
  43. response_db_txn_count = Counter(
  44. "synapse_http_server_response_db_txn_count", "", ["method", "servlet", "tag"]
  45. )
  46. # seconds spent waiting for db txns, excluding scheduling time, when processing
  47. # this request
  48. response_db_txn_duration = Counter(
  49. "synapse_http_server_response_db_txn_duration_seconds",
  50. "",
  51. ["method", "servlet", "tag"],
  52. )
  53. # seconds spent waiting for a db connection, when processing this request
  54. response_db_sched_duration = Counter(
  55. "synapse_http_server_response_db_sched_duration_seconds",
  56. "",
  57. ["method", "servlet", "tag"],
  58. )
  59. # size in bytes of the response written
  60. response_size = Counter(
  61. "synapse_http_server_response_size", "", ["method", "servlet", "tag"]
  62. )
  63. # In flight metrics are incremented while the requests are in flight, rather
  64. # than when the response was written.
  65. in_flight_requests_ru_utime = Counter(
  66. "synapse_http_server_in_flight_requests_ru_utime_seconds", "", ["method", "servlet"]
  67. )
  68. in_flight_requests_ru_stime = Counter(
  69. "synapse_http_server_in_flight_requests_ru_stime_seconds", "", ["method", "servlet"]
  70. )
  71. in_flight_requests_db_txn_count = Counter(
  72. "synapse_http_server_in_flight_requests_db_txn_count", "", ["method", "servlet"]
  73. )
  74. # seconds spent waiting for db txns, excluding scheduling time, when processing
  75. # this request
  76. in_flight_requests_db_txn_duration = Counter(
  77. "synapse_http_server_in_flight_requests_db_txn_duration_seconds",
  78. "",
  79. ["method", "servlet"],
  80. )
  81. # seconds spent waiting for a db connection, when processing this request
  82. in_flight_requests_db_sched_duration = Counter(
  83. "synapse_http_server_in_flight_requests_db_sched_duration_seconds",
  84. "",
  85. ["method", "servlet"],
  86. )
  87. # The set of all in flight requests, set[RequestMetrics]
  88. _in_flight_requests = set()
  89. # Protects the _in_flight_requests set from concurrent accesss
  90. _in_flight_requests_lock = threading.Lock()
  91. def _get_in_flight_counts():
  92. """Returns a count of all in flight requests by (method, server_name)
  93. Returns:
  94. dict[tuple[str, str], int]
  95. """
  96. # Cast to a list to prevent it changing while the Prometheus
  97. # thread is collecting metrics
  98. with _in_flight_requests_lock:
  99. reqs = list(_in_flight_requests)
  100. for rm in reqs:
  101. rm.update_metrics()
  102. # Map from (method, name) -> int, the number of in flight requests of that
  103. # type
  104. counts = {}
  105. for rm in reqs:
  106. key = (rm.method, rm.name)
  107. counts[key] = counts.get(key, 0) + 1
  108. return counts
  109. LaterGauge(
  110. "synapse_http_server_in_flight_requests_count",
  111. "",
  112. ["method", "servlet"],
  113. _get_in_flight_counts,
  114. )
  115. class RequestMetrics(object):
  116. def start(self, time_sec, name, method):
  117. self.start = time_sec
  118. self.start_context = LoggingContext.current_context()
  119. self.name = name
  120. self.method = method
  121. # _request_stats records resource usage that we have already added
  122. # to the "in flight" metrics.
  123. self._request_stats = self.start_context.get_resource_usage()
  124. with _in_flight_requests_lock:
  125. _in_flight_requests.add(self)
  126. def stop(self, time_sec, response_code, sent_bytes):
  127. with _in_flight_requests_lock:
  128. _in_flight_requests.discard(self)
  129. context = LoggingContext.current_context()
  130. tag = ""
  131. if context:
  132. tag = context.tag
  133. if context != self.start_context:
  134. logger.warn(
  135. "Context have unexpectedly changed %r, %r",
  136. context,
  137. self.start_context,
  138. )
  139. return
  140. response_code = str(response_code)
  141. outgoing_responses_counter.labels(self.method, response_code).inc()
  142. response_count.labels(self.method, self.name, tag).inc()
  143. response_timer.labels(self.method, self.name, tag, response_code).observe(
  144. time_sec - self.start
  145. )
  146. resource_usage = context.get_resource_usage()
  147. response_ru_utime.labels(self.method, self.name, tag).inc(
  148. resource_usage.ru_utime
  149. )
  150. response_ru_stime.labels(self.method, self.name, tag).inc(
  151. resource_usage.ru_stime
  152. )
  153. response_db_txn_count.labels(self.method, self.name, tag).inc(
  154. resource_usage.db_txn_count
  155. )
  156. response_db_txn_duration.labels(self.method, self.name, tag).inc(
  157. resource_usage.db_txn_duration_sec
  158. )
  159. response_db_sched_duration.labels(self.method, self.name, tag).inc(
  160. resource_usage.db_sched_duration_sec
  161. )
  162. response_size.labels(self.method, self.name, tag).inc(sent_bytes)
  163. # We always call this at the end to ensure that we update the metrics
  164. # regardless of whether a call to /metrics while the request was in
  165. # flight.
  166. self.update_metrics()
  167. def update_metrics(self):
  168. """Updates the in flight metrics with values from this request.
  169. """
  170. new_stats = self.start_context.get_resource_usage()
  171. diff = new_stats - self._request_stats
  172. self._request_stats = new_stats
  173. # max() is used since rapid use of ru_stime/ru_utime can end up with the
  174. # count going backwards due to NTP, time smearing, fine-grained
  175. # correction, or floating points. Who knows, really?
  176. in_flight_requests_ru_utime.labels(self.method, self.name).inc(
  177. max(diff.ru_utime, 0)
  178. )
  179. in_flight_requests_ru_stime.labels(self.method, self.name).inc(
  180. max(diff.ru_stime, 0)
  181. )
  182. in_flight_requests_db_txn_count.labels(self.method, self.name).inc(
  183. diff.db_txn_count
  184. )
  185. in_flight_requests_db_txn_duration.labels(self.method, self.name).inc(
  186. diff.db_txn_duration_sec
  187. )
  188. in_flight_requests_db_sched_duration.labels(self.method, self.name).inc(
  189. diff.db_sched_duration_sec
  190. )