Browse Source

Avoid incrementing bg process utime/stime counters by negative durations (#14323)

David Robertson 1 year ago
parent
commit
2bb2c32e8e

+ 1 - 0
changelog.d/14323.bugfix

@@ -0,0 +1 @@
+Fix a bug introduced in Synapse 0.34.0rc2 where logs could include error spam when background processes are measured as taking a negative amount of time.

+ 3 - 1
mypy.ini

@@ -56,7 +56,6 @@ exclude = (?x)
    |tests/rest/media/v1/test_media_storage.py
    |tests/server.py
    |tests/server_notices/test_resource_limits_server_notices.py
-   |tests/test_metrics.py
    |tests/test_state.py
    |tests/test_terms_auth.py
    |tests/util/caches/test_cached_call.py
@@ -106,6 +105,9 @@ disallow_untyped_defs = False
 [mypy-tests.handlers.test_user_directory]
 disallow_untyped_defs = True
 
+[mypy-tests.metrics.test_background_process_metrics]
+disallow_untyped_defs = True
+
 [mypy-tests.push.test_bulk_push_rule_evaluator]
 disallow_untyped_defs = True
 

+ 4 - 2
synapse/metrics/background_process_metrics.py

@@ -174,8 +174,10 @@ class _BackgroundProcess:
             diff = new_stats - self._reported_stats
         self._reported_stats = new_stats
 
-        _background_process_ru_utime.labels(self.desc).inc(diff.ru_utime)
-        _background_process_ru_stime.labels(self.desc).inc(diff.ru_stime)
+        # For unknown reasons, the difference in times can be negative. See comment in
+        # synapse.http.request_metrics.RequestMetrics.update_metrics.
+        _background_process_ru_utime.labels(self.desc).inc(max(diff.ru_utime, 0))
+        _background_process_ru_stime.labels(self.desc).inc(max(diff.ru_stime, 0))
         _background_process_db_txn_count.labels(self.desc).inc(diff.db_txn_count)
         _background_process_db_txn_duration.labels(self.desc).inc(
             diff.db_txn_duration_sec

+ 0 - 0
tests/metrics/__init__.py


+ 19 - 0
tests/metrics/test_background_process_metrics.py

@@ -0,0 +1,19 @@
+from unittest import TestCase as StdlibTestCase
+from unittest.mock import Mock
+
+from synapse.logging.context import ContextResourceUsage, LoggingContext
+from synapse.metrics.background_process_metrics import _BackgroundProcess
+
+
+class TestBackgroundProcessMetrics(StdlibTestCase):
+    def test_update_metrics_with_negative_time_diff(self) -> None:
+        """We should ignore negative reported utime and stime differences"""
+        usage = ContextResourceUsage()
+        usage.ru_stime = usage.ru_utime = -1.0
+
+        mock_logging_context = Mock(spec=LoggingContext)
+        mock_logging_context.get_resource_usage.return_value = usage
+
+        process = _BackgroundProcess("test process", mock_logging_context)
+        # Should not raise
+        process.update_metrics()

+ 8 - 2
tests/test_metrics.py → tests/metrics/test_metrics.py

@@ -12,6 +12,8 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+from typing_extensions import Protocol
+
 try:
     from importlib import metadata
 except ImportError:
@@ -52,7 +54,11 @@ def get_sample_labels_value(sample):
 
 class TestMauLimit(unittest.TestCase):
     def test_basic(self):
-        gauge = InFlightGauge(
+        class MetricEntry(Protocol):
+            foo: int
+            bar: int
+
+        gauge: InFlightGauge[MetricEntry] = InFlightGauge(
             "test1", "", labels=["test_label"], sub_metrics=["foo", "bar"]
         )
 
@@ -146,7 +152,7 @@ class CacheMetricsTests(unittest.HomeserverTestCase):
         Caches produce metrics reflecting their state when scraped.
         """
         CACHE_NAME = "cache_metrics_test_fgjkbdfg"
-        cache = DeferredCache(CACHE_NAME, max_entries=777)
+        cache: DeferredCache[str, str] = DeferredCache(CACHE_NAME, max_entries=777)
 
         items = {
             x.split(b"{")[0].decode("ascii"): x.split(b" ")[1].decode("ascii")