Browse Source

Refactor logcontext resource usage tracking (#3501)

Factor out the resource usage tracking out to a separate object, which can be
passed around and copied independently of the logcontext itself.
Richard van der Hoff 5 years ago
parent
commit
55370331da
5 changed files with 146 additions and 130 deletions
  1. 0 0
      changelog.d/3501.misc
  2. 17 66
      synapse/http/request_metrics.py
  3. 9 15
      synapse/http/site.py
  4. 111 33
      synapse/util/logcontext.py
  5. 9 16
      synapse/util/metrics.py

+ 0 - 0
changelog.d/3501.misc


+ 17 - 66
synapse/http/request_metrics.py

@@ -149,7 +149,9 @@ class RequestMetrics(object):
         self.name = name
         self.method = method
 
-        self._request_stats = _RequestStats.from_context(self.start_context)
+        # _request_stats records resource usage that we have already added
+        # to the "in flight" metrics.
+        self._request_stats = self.start_context.get_resource_usage()
 
         _in_flight_requests.add(self)
 
@@ -177,18 +179,22 @@ class RequestMetrics(object):
             time_sec - self.start
         )
 
-        ru_utime, ru_stime = context.get_resource_usage()
+        resource_usage = context.get_resource_usage()
 
-        response_ru_utime.labels(request.method, self.name, tag).inc(ru_utime)
-        response_ru_stime.labels(request.method, self.name, tag).inc(ru_stime)
+        response_ru_utime.labels(request.method, self.name, tag).inc(
+            resource_usage.ru_utime,
+        )
+        response_ru_stime.labels(request.method, self.name, tag).inc(
+            resource_usage.ru_stime,
+        )
         response_db_txn_count.labels(request.method, self.name, tag).inc(
-            context.db_txn_count
+            resource_usage.db_txn_count
         )
         response_db_txn_duration.labels(request.method, self.name, tag).inc(
-            context.db_txn_duration_sec
+            resource_usage.db_txn_duration_sec
         )
         response_db_sched_duration.labels(request.method, self.name, tag).inc(
-            context.db_sched_duration_sec
+            resource_usage.db_sched_duration_sec
         )
 
         response_size.labels(request.method, self.name, tag).inc(request.sentLength)
@@ -201,7 +207,10 @@ class RequestMetrics(object):
     def update_metrics(self):
         """Updates the in flight metrics with values from this request.
         """
-        diff = self._request_stats.update(self.start_context)
+        new_stats = self.start_context.get_resource_usage()
+
+        diff = new_stats - self._request_stats
+        self._request_stats = new_stats
 
         in_flight_requests_ru_utime.labels(self.method, self.name).inc(diff.ru_utime)
         in_flight_requests_ru_stime.labels(self.method, self.name).inc(diff.ru_stime)
@@ -217,61 +226,3 @@ class RequestMetrics(object):
         in_flight_requests_db_sched_duration.labels(self.method, self.name).inc(
             diff.db_sched_duration_sec
         )
-
-
-class _RequestStats(object):
-    """Keeps tracks of various metrics for an in flight request.
-    """
-
-    __slots__ = [
-        "ru_utime",
-        "ru_stime",
-        "db_txn_count",
-        "db_txn_duration_sec",
-        "db_sched_duration_sec",
-    ]
-
-    def __init__(
-        self, ru_utime, ru_stime, db_txn_count, db_txn_duration_sec, db_sched_duration_sec
-    ):
-        self.ru_utime = ru_utime
-        self.ru_stime = ru_stime
-        self.db_txn_count = db_txn_count
-        self.db_txn_duration_sec = db_txn_duration_sec
-        self.db_sched_duration_sec = db_sched_duration_sec
-
-    @staticmethod
-    def from_context(context):
-        ru_utime, ru_stime = context.get_resource_usage()
-
-        return _RequestStats(
-            ru_utime, ru_stime,
-            context.db_txn_count,
-            context.db_txn_duration_sec,
-            context.db_sched_duration_sec,
-        )
-
-    def update(self, context):
-        """Updates the current values and returns the difference between the
-        old and new values.
-
-        Returns:
-            _RequestStats: The difference between the old and new values
-        """
-        new = _RequestStats.from_context(context)
-
-        diff = _RequestStats(
-            new.ru_utime - self.ru_utime,
-            new.ru_stime - self.ru_stime,
-            new.db_txn_count - self.db_txn_count,
-            new.db_txn_duration_sec - self.db_txn_duration_sec,
-            new.db_sched_duration_sec - self.db_sched_duration_sec,
-        )
-
-        self.ru_utime = new.ru_utime
-        self.ru_stime = new.ru_stime
-        self.db_txn_count = new.db_txn_count
-        self.db_txn_duration_sec = new.db_txn_duration_sec
-        self.db_sched_duration_sec = new.db_sched_duration_sec
-
-        return diff

+ 9 - 15
synapse/http/site.py

@@ -20,7 +20,7 @@ from twisted.web.server import Request, Site
 
 from synapse.http import redact_uri
 from synapse.http.request_metrics import RequestMetrics
-from synapse.util.logcontext import LoggingContext
+from synapse.util.logcontext import LoggingContext, ContextResourceUsage
 
 logger = logging.getLogger(__name__)
 
@@ -95,15 +95,9 @@ class SynapseRequest(Request):
     def _finished_processing(self):
         try:
             context = LoggingContext.current_context()
-            ru_utime, ru_stime = context.get_resource_usage()
-            db_txn_count = context.db_txn_count
-            db_txn_duration_sec = context.db_txn_duration_sec
-            db_sched_duration_sec = context.db_sched_duration_sec
-            evt_db_fetch_count = context.evt_db_fetch_count
+            usage = context.get_resource_usage()
         except Exception:
-            ru_utime, ru_stime = (0, 0)
-            db_txn_count, db_txn_duration_sec = (0, 0)
-            evt_db_fetch_count = 0
+            usage = ContextResourceUsage()
 
         end_time = time.time()
 
@@ -130,18 +124,18 @@ class SynapseRequest(Request):
             self.site.site_tag,
             authenticated_entity,
             end_time - self.start_time,
-            ru_utime,
-            ru_stime,
-            db_sched_duration_sec,
-            db_txn_duration_sec,
-            int(db_txn_count),
+            usage.ru_utime,
+            usage.ru_stime,
+            usage.db_sched_duration_sec,
+            usage.db_txn_duration_sec,
+            int(usage.db_txn_count),
             self.sentLength,
             self.code,
             self.method,
             self.get_redacted_uri(),
             self.clientproto,
             user_agent,
-            evt_db_fetch_count,
+            usage.evt_db_fetch_count,
         )
 
         try:

+ 111 - 33
synapse/util/logcontext.py

@@ -49,6 +49,90 @@ except Exception:
         return None
 
 
+class ContextResourceUsage(object):
+    """Object for tracking the resources used by a log context
+
+    Attributes:
+        ru_utime (float): user CPU time (in seconds)
+        ru_stime (float): system CPU time (in seconds)
+        db_txn_count (int): number of database transactions done
+        db_sched_duration_sec (float): amount of time spent waiting for a
+            database connection
+        db_txn_duration_sec (float): amount of time spent doing database
+            transactions (excluding scheduling time)
+        evt_db_fetch_count (int): number of events requested from the database
+    """
+
+    __slots__ = [
+        "ru_stime", "ru_utime",
+        "db_txn_count", "db_txn_duration_sec", "db_sched_duration_sec",
+        "evt_db_fetch_count",
+    ]
+
+    def __init__(self, copy_from=None):
+        """Create a new ContextResourceUsage
+
+        Args:
+            copy_from (ContextResourceUsage|None): if not None, an object to
+                copy stats from
+        """
+        if copy_from is None:
+            self.reset()
+        else:
+            self.ru_utime = copy_from.ru_utime
+            self.ru_stime = copy_from.ru_stime
+            self.db_txn_count = copy_from.db_txn_count
+
+            self.db_txn_duration_sec = copy_from.db_txn_duration_sec
+            self.db_sched_duration_sec = copy_from.db_sched_duration_sec
+            self.evt_db_fetch_count = copy_from.evt_db_fetch_count
+
+    def copy(self):
+        return ContextResourceUsage(copy_from=self)
+
+    def reset(self):
+        self.ru_stime = 0.
+        self.ru_utime = 0.
+        self.db_txn_count = 0
+
+        self.db_txn_duration_sec = 0
+        self.db_sched_duration_sec = 0
+        self.evt_db_fetch_count = 0
+
+    def __iadd__(self, other):
+        """Add another ContextResourceUsage's stats to this one's.
+
+        Args:
+            other (ContextResourceUsage): the other resource usage object
+        """
+        self.ru_utime += other.ru_utime
+        self.ru_stime += other.ru_stime
+        self.db_txn_count += other.db_txn_count
+        self.db_txn_duration_sec += other.db_txn_duration_sec
+        self.db_sched_duration_sec += other.db_sched_duration_sec
+        self.evt_db_fetch_count += other.evt_db_fetch_count
+        return self
+
+    def __isub__(self, other):
+        self.ru_utime -= other.ru_utime
+        self.ru_stime -= other.ru_stime
+        self.db_txn_count -= other.db_txn_count
+        self.db_txn_duration_sec -= other.db_txn_duration_sec
+        self.db_sched_duration_sec -= other.db_sched_duration_sec
+        self.evt_db_fetch_count -= other.evt_db_fetch_count
+        return self
+
+    def __add__(self, other):
+        res = ContextResourceUsage(copy_from=self)
+        res += other
+        return res
+
+    def __sub__(self, other):
+        res = ContextResourceUsage(copy_from=self)
+        res -= other
+        return res
+
+
 class LoggingContext(object):
     """Additional context for log formatting. Contexts are scoped within a
     "with" block.
@@ -58,9 +142,8 @@ class LoggingContext(object):
     """
 
     __slots__ = [
-        "previous_context", "name", "ru_stime", "ru_utime",
-        "db_txn_count", "db_txn_duration_sec", "db_sched_duration_sec",
-        "evt_db_fetch_count",
+        "previous_context", "name",
+        "_resource_usage",
         "usage_start",
         "main_thread", "alive",
         "request", "tag",
@@ -103,18 +186,9 @@ class LoggingContext(object):
     def __init__(self, name=None):
         self.previous_context = LoggingContext.current_context()
         self.name = name
-        self.ru_stime = 0.
-        self.ru_utime = 0.
-        self.db_txn_count = 0
 
-        # sec spent waiting for db txns, excluding scheduling time
-        self.db_txn_duration_sec = 0
-
-        # sec spent waiting for db txns to be scheduled
-        self.db_sched_duration_sec = 0
-
-        # number of events this thread has fetched from the db
-        self.evt_db_fetch_count = 0
+        # track the resources used by this context so far
+        self._resource_usage = ContextResourceUsage()
 
         # If alive has the thread resource usage when the logcontext last
         # became active.
@@ -207,39 +281,43 @@ class LoggingContext(object):
             logger.warning("Stopped logcontext %s on different thread", self)
             return
 
-        # When we stop, let's record the resource used since we started
-        if self.usage_start:
-            usage_end = get_thread_resource_usage()
+        # When we stop, let's record the cpu used since we started
+        if not self.usage_start:
+            logger.warning(
+                "Called stop on logcontext %s without calling start", self,
+            )
+            return
+
+        usage_end = get_thread_resource_usage()
 
-            self.ru_utime += usage_end.ru_utime - self.usage_start.ru_utime
-            self.ru_stime += usage_end.ru_stime - self.usage_start.ru_stime
+        self._resource_usage.ru_utime += usage_end.ru_utime - self.usage_start.ru_utime
+        self._resource_usage.ru_stime += usage_end.ru_stime - self.usage_start.ru_stime
 
-            self.usage_start = None
-        else:
-            logger.warning("Called stop on logcontext %s without calling start", self)
+        self.usage_start = None
 
     def get_resource_usage(self):
-        """Get CPU time used by this logcontext so far.
+        """Get resources used by this logcontext so far.
 
         Returns:
-            tuple[float, float]: The user and system CPU usage in seconds
+            ContextResourceUsage: a *copy* of the object tracking resource
+                usage so far
         """
-        ru_utime = self.ru_utime
-        ru_stime = self.ru_stime
+        # we always return a copy, for consistency
+        res = self._resource_usage.copy()
 
         # If we are on the correct thread and we're currently running then we
         # can include resource usage so far.
         is_main_thread = threading.current_thread() is self.main_thread
         if self.alive and self.usage_start and is_main_thread:
             current = get_thread_resource_usage()
-            ru_utime += current.ru_utime - self.usage_start.ru_utime
-            ru_stime += current.ru_stime - self.usage_start.ru_stime
+            res.ru_utime += current.ru_utime - self.usage_start.ru_utime
+            res.ru_stime += current.ru_stime - self.usage_start.ru_stime
 
-        return ru_utime, ru_stime
+        return res
 
     def add_database_transaction(self, duration_sec):
-        self.db_txn_count += 1
-        self.db_txn_duration_sec += duration_sec
+        self._resource_usage.db_txn_count += 1
+        self._resource_usage.db_txn_duration_sec += duration_sec
 
     def add_database_scheduled(self, sched_sec):
         """Record a use of the database pool
@@ -248,7 +326,7 @@ class LoggingContext(object):
             sched_sec (float): number of seconds it took us to get a
                 connection
         """
-        self.db_sched_duration_sec += sched_sec
+        self._resource_usage.db_sched_duration_sec += sched_sec
 
     def record_event_fetch(self, event_count):
         """Record a number of events being fetched from the db
@@ -256,7 +334,7 @@ class LoggingContext(object):
         Args:
             event_count (int): number of events being fetched
         """
-        self.evt_db_fetch_count += event_count
+        self._resource_usage.evt_db_fetch_count += event_count
 
 
 class LoggingContextFilter(logging.Filter):

+ 9 - 16
synapse/util/metrics.py

@@ -60,10 +60,9 @@ def measure_func(name):
 
 class Measure(object):
     __slots__ = [
-        "clock", "name", "start_context", "start", "new_context", "ru_utime",
-        "ru_stime",
-        "db_txn_count", "db_txn_duration_sec", "db_sched_duration_sec",
+        "clock", "name", "start_context", "start",
         "created_context",
+        "start_usage",
     ]
 
     def __init__(self, clock, name):
@@ -81,10 +80,7 @@ class Measure(object):
             self.start_context.__enter__()
             self.created_context = True
 
-        self.ru_utime, self.ru_stime = self.start_context.get_resource_usage()
-        self.db_txn_count = self.start_context.db_txn_count
-        self.db_txn_duration_sec = self.start_context.db_txn_duration_sec
-        self.db_sched_duration_sec = self.start_context.db_sched_duration_sec
+        self.start_usage = self.start_context.get_resource_usage()
 
     def __exit__(self, exc_type, exc_val, exc_tb):
         if isinstance(exc_type, Exception) or not self.start_context:
@@ -108,15 +104,12 @@ class Measure(object):
             logger.warn("Expected context. (%r)", self.name)
             return
 
-        ru_utime, ru_stime = context.get_resource_usage()
-
-        block_ru_utime.labels(self.name).inc(ru_utime - self.ru_utime)
-        block_ru_stime.labels(self.name).inc(ru_stime - self.ru_stime)
-        block_db_txn_count.labels(self.name).inc(context.db_txn_count - self.db_txn_count)
-        block_db_txn_duration.labels(self.name).inc(
-            context.db_txn_duration_sec - self.db_txn_duration_sec)
-        block_db_sched_duration.labels(self.name).inc(
-            context.db_sched_duration_sec - self.db_sched_duration_sec)
+        usage = context.get_resource_usage() - self.start_usage
+        block_ru_utime.labels(self.name).inc(usage.ru_utime)
+        block_ru_stime.labels(self.name).inc(usage.ru_stime)
+        block_db_txn_count.labels(self.name).inc(usage.db_txn_count)
+        block_db_txn_duration.labels(self.name).inc(usage.db_txn_duration_sec)
+        block_db_sched_duration.labels(self.name).inc(usage.db_sched_duration_sec)
 
         if self.created_context:
             self.start_context.__exit__(exc_type, exc_val, exc_tb)