Browse Source

Don't reset retry timers on "valid" error codes (#16221)

Erik Johnston 9 tháng trước cách đây
mục cha
commit
f84baecb6f

+ 1 - 0
changelog.d/16221.bugfix

@@ -0,0 +1 @@
+Fix long-standing bug where we did not correctly back off from servers that had "gone" if they returned 4xx series error codes.

+ 3 - 1
synapse/federation/transport/client.py

@@ -249,8 +249,10 @@ class TransportLayerClient:
             data=json_data,
             json_data_callback=json_data_callback,
             long_retries=True,
-            backoff_on_404=True,  # If we get a 404 the other side has gone
             try_trailing_slash_on_400=True,
+            # Sending a transaction should always succeed, if it doesn't
+            # then something is wrong and we should backoff.
+            backoff_on_all_error_codes=True,
         )
 
     async def make_query(

+ 8 - 0
synapse/http/matrixfederationclient.py

@@ -512,6 +512,7 @@ class MatrixFederationHttpClient:
         long_retries: bool = False,
         ignore_backoff: bool = False,
         backoff_on_404: bool = False,
+        backoff_on_all_error_codes: bool = False,
     ) -> IResponse:
         """
         Sends a request to the given server.
@@ -552,6 +553,7 @@ class MatrixFederationHttpClient:
                 and try the request anyway.
 
             backoff_on_404: Back off if we get a 404
+            backoff_on_all_error_codes: Back off if we get any error response
 
         Returns:
             Resolves with the HTTP response object on success.
@@ -594,6 +596,7 @@ class MatrixFederationHttpClient:
             ignore_backoff=ignore_backoff,
             notifier=self.hs.get_notifier(),
             replication_client=self.hs.get_replication_command_handler(),
+            backoff_on_all_error_codes=backoff_on_all_error_codes,
         )
 
         method_bytes = request.method.encode("ascii")
@@ -889,6 +892,7 @@ class MatrixFederationHttpClient:
         backoff_on_404: bool = False,
         try_trailing_slash_on_400: bool = False,
         parser: Literal[None] = None,
+        backoff_on_all_error_codes: bool = False,
     ) -> JsonDict:
         ...
 
@@ -906,6 +910,7 @@ class MatrixFederationHttpClient:
         backoff_on_404: bool = False,
         try_trailing_slash_on_400: bool = False,
         parser: Optional[ByteParser[T]] = None,
+        backoff_on_all_error_codes: bool = False,
     ) -> T:
         ...
 
@@ -922,6 +927,7 @@ class MatrixFederationHttpClient:
         backoff_on_404: bool = False,
         try_trailing_slash_on_400: bool = False,
         parser: Optional[ByteParser[T]] = None,
+        backoff_on_all_error_codes: bool = False,
     ) -> Union[JsonDict, T]:
         """Sends the specified json data using PUT
 
@@ -957,6 +963,7 @@ class MatrixFederationHttpClient:
                 enabled.
             parser: The parser to use to decode the response. Defaults to
                 parsing as JSON.
+            backoff_on_all_error_codes: Back off if we get any error response
 
         Returns:
             Succeeds when we get a 2xx HTTP response. The
@@ -990,6 +997,7 @@ class MatrixFederationHttpClient:
             ignore_backoff=ignore_backoff,
             long_retries=long_retries,
             timeout=timeout,
+            backoff_on_all_error_codes=backoff_on_all_error_codes,
         )
 
         if timeout is not None:

+ 16 - 2
synapse/util/retryutils.py

@@ -128,6 +128,7 @@ class RetryDestinationLimiter:
         backoff_on_failure: bool = True,
         notifier: Optional["Notifier"] = None,
         replication_client: Optional["ReplicationCommandHandler"] = None,
+        backoff_on_all_error_codes: bool = False,
     ):
         """Marks the destination as "down" if an exception is thrown in the
         context, except for CodeMessageException with code < 500.
@@ -147,6 +148,9 @@ class RetryDestinationLimiter:
 
             backoff_on_failure: set to False if we should not increase the
                 retry interval on a failure.
+
+            backoff_on_all_error_codes: Whether we should back off on any
+                error code.
         """
         self.clock = clock
         self.store = store
@@ -156,6 +160,7 @@ class RetryDestinationLimiter:
         self.retry_interval = retry_interval
         self.backoff_on_404 = backoff_on_404
         self.backoff_on_failure = backoff_on_failure
+        self.backoff_on_all_error_codes = backoff_on_all_error_codes
 
         self.notifier = notifier
         self.replication_client = replication_client
@@ -179,6 +184,7 @@ class RetryDestinationLimiter:
         exc_val: Optional[BaseException],
         exc_tb: Optional[TracebackType],
     ) -> None:
+        success = exc_type is None
         valid_err_code = False
         if exc_type is None:
             valid_err_code = True
@@ -195,7 +201,9 @@ class RetryDestinationLimiter:
             # won't accept our requests for at least a while.
             # 429 is us being aggressively rate limited, so lets rate limit
             # ourselves.
-            if exc_val.code == 404 and self.backoff_on_404:
+            if self.backoff_on_all_error_codes:
+                valid_err_code = False
+            elif exc_val.code == 404 and self.backoff_on_404:
                 valid_err_code = False
             elif exc_val.code in (401, 429):
                 valid_err_code = False
@@ -204,7 +212,7 @@ class RetryDestinationLimiter:
             else:
                 valid_err_code = False
 
-        if valid_err_code:
+        if success:
             # We connected successfully.
             if not self.retry_interval:
                 return
@@ -215,6 +223,12 @@ class RetryDestinationLimiter:
             self.failure_ts = None
             retry_last_ts = 0
             self.retry_interval = 0
+        elif valid_err_code:
+            # We got a potentially valid error code back. We don't reset the
+            # timers though, as the other side might actually be down anyway
+            # (e.g. some deprovisioned servers will always return a 404 or 403,
+            # and we don't want to keep resetting the retry timers for them).
+            return
         elif not self.backoff_on_failure:
             return
         else:

+ 2 - 2
tests/handlers/test_typing.py

@@ -251,8 +251,8 @@ class TypingNotificationsTestCase(unittest.HomeserverTestCase):
             ),
             json_data_callback=ANY,
             long_retries=True,
-            backoff_on_404=True,
             try_trailing_slash_on_400=True,
+            backoff_on_all_error_codes=True,
         )
 
     def test_started_typing_remote_recv(self) -> None:
@@ -366,7 +366,7 @@ class TypingNotificationsTestCase(unittest.HomeserverTestCase):
             ),
             json_data_callback=ANY,
             long_retries=True,
-            backoff_on_404=True,
+            backoff_on_all_error_codes=True,
             try_trailing_slash_on_400=True,
         )