Browse Source

Make the metrics less racy (#4061)

Amber Brown 5 years ago
parent
commit
b69216f768
3 changed files with 22 additions and 16 deletions
  1. 1 0
      changelog.d/4061.bugfix
  2. 18 13
      synapse/http/request_metrics.py
  3. 3 3
      synapse/notifier.py

+ 1 - 0
changelog.d/4061.bugfix

@@ -0,0 +1 @@
+Fix some metrics being racy and causing exceptions when polled by Prometheus.

+ 18 - 13
synapse/http/request_metrics.py

@@ -39,7 +39,8 @@ outgoing_responses_counter = Counter(
 )
 )
 
 
 response_timer = Histogram(
 response_timer = Histogram(
-    "synapse_http_server_response_time_seconds", "sec",
+    "synapse_http_server_response_time_seconds",
+    "sec",
     ["method", "servlet", "tag", "code"],
     ["method", "servlet", "tag", "code"],
 )
 )
 
 
@@ -79,15 +80,11 @@ response_size = Counter(
 # than when the response was written.
 # than when the response was written.
 
 
 in_flight_requests_ru_utime = Counter(
 in_flight_requests_ru_utime = Counter(
-    "synapse_http_server_in_flight_requests_ru_utime_seconds",
-    "",
-    ["method", "servlet"],
+    "synapse_http_server_in_flight_requests_ru_utime_seconds", "", ["method", "servlet"]
 )
 )
 
 
 in_flight_requests_ru_stime = Counter(
 in_flight_requests_ru_stime = Counter(
-    "synapse_http_server_in_flight_requests_ru_stime_seconds",
-    "",
-    ["method", "servlet"],
+    "synapse_http_server_in_flight_requests_ru_stime_seconds", "", ["method", "servlet"]
 )
 )
 
 
 in_flight_requests_db_txn_count = Counter(
 in_flight_requests_db_txn_count = Counter(
@@ -134,7 +131,7 @@ def _get_in_flight_counts():
     # type
     # type
     counts = {}
     counts = {}
     for rm in reqs:
     for rm in reqs:
-        key = (rm.method, rm.name,)
+        key = (rm.method, rm.name)
         counts[key] = counts.get(key, 0) + 1
         counts[key] = counts.get(key, 0) + 1
 
 
     return counts
     return counts
@@ -175,7 +172,8 @@ class RequestMetrics(object):
             if context != self.start_context:
             if context != self.start_context:
                 logger.warn(
                 logger.warn(
                     "Context have unexpectedly changed %r, %r",
                     "Context have unexpectedly changed %r, %r",
-                    context, self.start_context
+                    context,
+                    self.start_context,
                 )
                 )
                 return
                 return
 
 
@@ -192,10 +190,10 @@ class RequestMetrics(object):
         resource_usage = context.get_resource_usage()
         resource_usage = context.get_resource_usage()
 
 
         response_ru_utime.labels(self.method, self.name, tag).inc(
         response_ru_utime.labels(self.method, self.name, tag).inc(
-            resource_usage.ru_utime,
+            resource_usage.ru_utime
         )
         )
         response_ru_stime.labels(self.method, self.name, tag).inc(
         response_ru_stime.labels(self.method, self.name, tag).inc(
-            resource_usage.ru_stime,
+            resource_usage.ru_stime
         )
         )
         response_db_txn_count.labels(self.method, self.name, tag).inc(
         response_db_txn_count.labels(self.method, self.name, tag).inc(
             resource_usage.db_txn_count
             resource_usage.db_txn_count
@@ -222,8 +220,15 @@ class RequestMetrics(object):
         diff = new_stats - self._request_stats
         diff = new_stats - self._request_stats
         self._request_stats = new_stats
         self._request_stats = new_stats
 
 
-        in_flight_requests_ru_utime.labels(self.method, self.name).inc(diff.ru_utime)
-        in_flight_requests_ru_stime.labels(self.method, self.name).inc(diff.ru_stime)
+        # max() is used since rapid use of ru_stime/ru_utime can end up with the
+        # count going backwards due to NTP, time smearing, fine-grained
+        # correction, or floating points. Who knows, really?
+        in_flight_requests_ru_utime.labels(self.method, self.name).inc(
+            max(diff.ru_utime, 0)
+        )
+        in_flight_requests_ru_stime.labels(self.method, self.name).inc(
+            max(diff.ru_stime, 0)
+        )
 
 
         in_flight_requests_db_txn_count.labels(self.method, self.name).inc(
         in_flight_requests_db_txn_count.labels(self.method, self.name).inc(
             diff.db_txn_count
             diff.db_txn_count

+ 3 - 3
synapse/notifier.py

@@ -186,9 +186,9 @@ class Notifier(object):
         def count_listeners():
         def count_listeners():
             all_user_streams = set()
             all_user_streams = set()
 
 
-            for x in self.room_to_user_streams.values():
+            for x in list(self.room_to_user_streams.values()):
                 all_user_streams |= x
                 all_user_streams |= x
-            for x in self.user_to_user_stream.values():
+            for x in list(self.user_to_user_stream.values()):
                 all_user_streams.add(x)
                 all_user_streams.add(x)
 
 
             return sum(stream.count_listeners() for stream in all_user_streams)
             return sum(stream.count_listeners() for stream in all_user_streams)
@@ -196,7 +196,7 @@ class Notifier(object):
 
 
         LaterGauge(
         LaterGauge(
             "synapse_notifier_rooms", "", [],
             "synapse_notifier_rooms", "", [],
-            lambda: count(bool, self.room_to_user_streams.values()),
+            lambda: count(bool, list(self.room_to_user_streams.values())),
         )
         )
         LaterGauge(
         LaterGauge(
             "synapse_notifier_users", "", [],
             "synapse_notifier_users", "", [],