Forráskód Böngészése

Delete stale non-e2e devices for users, take 3 (#15183)

This should help reduce the number of devices racked up by e.g. simple bots that repeatedly log in.

We only delete non-e2e devices as they should be safe to delete, whereas if we delete e2e devices for a user we may accidentally break their ability to receive e2e keys for a message.
Erik Johnston 1 éve
szülő
commit
78cdb72cd6

+ 1 - 0
changelog.d/15183.misc

@@ -0,0 +1 @@
+Prune user's old devices on login if they have too many.

+ 1 - 1
synapse/handlers/device.py

@@ -485,7 +485,7 @@ class DeviceHandler(DeviceWorkerHandler):
             device_ids = [d for d in device_ids if d != except_device_id]
         await self.delete_devices(user_id, device_ids)
 
-    async def delete_devices(self, user_id: str, device_ids: List[str]) -> None:
+    async def delete_devices(self, user_id: str, device_ids: StrCollection) -> None:
         """Delete several devices
 
         Args:

+ 48 - 2
synapse/handlers/register.py

@@ -16,7 +16,7 @@
 """Contains functions for registering clients."""
 
 import logging
-from typing import TYPE_CHECKING, Iterable, List, Optional, Tuple
+from typing import TYPE_CHECKING, Iterable, List, Optional, Set, Tuple
 
 from prometheus_client import Counter
 from typing_extensions import TypedDict
@@ -40,6 +40,7 @@ from synapse.appservice import ApplicationService
 from synapse.config.server import is_threepid_reserved
 from synapse.handlers.device import DeviceHandler
 from synapse.http.servlet import assert_params_in_dict
+from synapse.metrics.background_process_metrics import run_as_background_process
 from synapse.replication.http.login import RegisterDeviceReplicationServlet
 from synapse.replication.http.register import (
     ReplicationPostRegisterActionsServlet,
@@ -48,6 +49,7 @@ from synapse.replication.http.register import (
 from synapse.spam_checker_api import RegistrationBehaviour
 from synapse.types import RoomAlias, UserID, create_requester
 from synapse.types.state import StateFilter
+from synapse.util.iterutils import batch_iter
 
 if TYPE_CHECKING:
     from synapse.server import HomeServer
@@ -110,6 +112,10 @@ class RegistrationHandler:
         self._server_notices_mxid = hs.config.servernotices.server_notices_mxid
         self._server_name = hs.hostname
 
+        # The set of users that we're currently pruning devices for. Ensures
+        # that we don't have two such jobs for the same user running at once.
+        self._currently_pruning_devices_for_users: Set[str] = set()
+
         self.spam_checker = hs.get_spam_checker()
 
         if hs.config.worker.worker_app:
@@ -121,7 +127,10 @@ class RegistrationHandler:
                 ReplicationPostRegisterActionsServlet.make_client(hs)
             )
         else:
-            self.device_handler = hs.get_device_handler()
+            device_handler = hs.get_device_handler()
+            assert isinstance(device_handler, DeviceHandler)
+            self.device_handler = device_handler
+
             self._register_device_client = self.register_device_inner
             self.pusher_pool = hs.get_pusherpool()
 
@@ -851,6 +860,9 @@ class RegistrationHandler:
         # This can only run on the main process.
         assert isinstance(self.device_handler, DeviceHandler)
 
+        # Prune the user's device list if they already have a lot of devices.
+        await self._maybe_prune_too_many_devices(user_id)
+
         registered_device_id = await self.device_handler.check_device_registered(
             user_id,
             device_id,
@@ -919,6 +931,40 @@ class RegistrationHandler:
             "refresh_token": refresh_token,
         }
 
+    async def _maybe_prune_too_many_devices(self, user_id: str) -> None:
+        """Delete any excess old devices this user may have."""
+
+        if user_id in self._currently_pruning_devices_for_users:
+            return
+
+        # We also cap the number of users whose devices we prune at the same
+        # time, to avoid performance problems.
+        if len(self._currently_pruning_devices_for_users) > 5:
+            return
+
+        device_ids = await self.store.check_too_many_devices_for_user(user_id)
+        if not device_ids:
+            return
+
+        # Now spawn a background loop that deletes said devices.
+        async def _prune_too_many_devices_loop() -> None:
+            if user_id in self._currently_pruning_devices_for_users:
+                return
+
+            self._currently_pruning_devices_for_users.add(user_id)
+
+            try:
+                for batch in batch_iter(device_ids, 10):
+                    await self.device_handler.delete_devices(user_id, batch)
+
+                    await self.clock.sleep(60)
+            finally:
+                self._currently_pruning_devices_for_users.discard(user_id)
+
+        run_as_background_process(
+            "_prune_too_many_devices_loop", _prune_too_many_devices_loop
+        )
+
     async def post_registration_actions(
         self, user_id: str, auth_result: dict, access_token: Optional[str]
     ) -> None:

+ 79 - 1
synapse/storage/databases/main/devices.py

@@ -1599,6 +1599,73 @@ class DeviceBackgroundUpdateStore(SQLBaseStore):
 
         return rows
 
+    async def check_too_many_devices_for_user(self, user_id: str) -> List[str]:
+        """Check if the user has a lot of devices, and if so return the set of
+        devices we can prune.
+
+        This does *not* return hidden devices or devices with E2E keys.
+        """
+
+        num_devices = await self.db_pool.simple_select_one_onecol(
+            table="devices",
+            keyvalues={"user_id": user_id, "hidden": False},
+            retcol="COALESCE(COUNT(*), 0)",
+            desc="count_devices",
+        )
+
+        # We let users have up to ten devices without pruning.
+        if num_devices <= 10:
+            return []
+
+        # We always prune devices not seen in the last 14 days...
+        max_last_seen = self._clock.time_msec() - 14 * 24 * 60 * 60 * 1000
+
+        # ... but we also cap the maximum number of devices the user can have to
+        # 50.
+        if num_devices > 50:
+            # Choose a last seen that ensures we keep at most 50 devices.
+            sql = """
+                SELECT last_seen FROM devices
+                LEFT JOIN e2e_device_keys_json USING (user_id, device_id)
+                WHERE
+                    user_id = ?
+                    AND NOT hidden
+                    AND last_seen IS NOT NULL
+                    AND key_json IS NULL
+                ORDER BY last_seen DESC
+                LIMIT 1
+                OFFSET 50
+            """
+
+            rows = await self.db_pool.execute(
+                "check_too_many_devices_for_user_last_seen", None, sql, (user_id,)
+            )
+            if rows:
+                max_last_seen = max(rows[0][0], max_last_seen)
+
+        # Fetch the devices to delete.
+        sql = """
+            SELECT DISTINCT device_id FROM devices
+            LEFT JOIN e2e_device_keys_json USING (user_id, device_id)
+            WHERE
+                user_id = ?
+                AND NOT hidden
+                AND last_seen < ?
+                AND key_json IS NULL
+            ORDER BY last_seen
+        """
+
+        def check_too_many_devices_for_user_txn(
+            txn: LoggingTransaction,
+        ) -> List[str]:
+            txn.execute(sql, (user_id, max_last_seen))
+            return [device_id for device_id, in txn]
+
+        return await self.db_pool.runInteraction(
+            "check_too_many_devices_for_user",
+            check_too_many_devices_for_user_txn,
+        )
+
 
 class DeviceStore(DeviceWorkerStore, DeviceBackgroundUpdateStore):
     # Because we have write access, this will be a StreamIdGenerator
@@ -1657,6 +1724,7 @@ class DeviceStore(DeviceWorkerStore, DeviceBackgroundUpdateStore):
                 values={},
                 insertion_values={
                     "display_name": initial_device_display_name,
+                    "last_seen": self._clock.time_msec(),
                     "hidden": False,
                 },
                 desc="store_device",
@@ -1702,7 +1770,15 @@ class DeviceStore(DeviceWorkerStore, DeviceBackgroundUpdateStore):
             )
             raise StoreError(500, "Problem storing device.")
 
-    async def delete_devices(self, user_id: str, device_ids: List[str]) -> None:
+    @cached(max_entries=0)
+    async def delete_device(self, user_id: str, device_id: str) -> None:
+        raise NotImplementedError()
+
+    # Note: sometimes deleting rows out of `device_inbox` can take a long time,
+    # so we use a cache so that we deduplicate in flight requests to delete
+    # devices.
+    @cachedList(cached_method_name="delete_device", list_name="device_ids")
+    async def delete_devices(self, user_id: str, device_ids: Collection[str]) -> dict:
         """Deletes several devices.
 
         Args:
@@ -1739,6 +1815,8 @@ class DeviceStore(DeviceWorkerStore, DeviceBackgroundUpdateStore):
         for device_id in device_ids:
             self.device_id_exists_cache.invalidate((user_id, device_id))
 
+        return {}
+
     async def update_device(
         self, user_id: str, device_id: str, new_display_name: Optional[str] = None
     ) -> None:

+ 1 - 1
tests/handlers/test_admin.py

@@ -272,7 +272,7 @@ class ExfiltrateData(unittest.HomeserverTestCase):
         self.assertIn("device_id", args[0][0])
         self.assertIsNone(args[0][0]["display_name"])
         self.assertIsNone(args[0][0]["last_seen_user_agent"])
-        self.assertIsNone(args[0][0]["last_seen_ts"])
+        self.assertEqual(args[0][0]["last_seen_ts"], 600)
         self.assertIsNone(args[0][0]["last_seen_ip"])
 
     def test_connections(self) -> None:

+ 1 - 1
tests/handlers/test_device.py

@@ -115,7 +115,7 @@ class DeviceTestCase(unittest.HomeserverTestCase):
                 "device_id": "xyz",
                 "display_name": "display 0",
                 "last_seen_ip": None,
-                "last_seen_ts": None,
+                "last_seen_ts": 1000000,
             },
             device_map["xyz"],
         )

+ 3 - 1
tests/storage/test_client_ips.py

@@ -170,6 +170,8 @@ class ClientIpStoreTestCase(unittest.HomeserverTestCase):
             )
         )
 
+        last_seen = self.clock.time_msec()
+
         if after_persisting:
             # Trigger the storage loop
             self.reactor.advance(10)
@@ -190,7 +192,7 @@ class ClientIpStoreTestCase(unittest.HomeserverTestCase):
                         "device_id": device_id,
                         "ip": None,
                         "user_agent": None,
-                        "last_seen": None,
+                        "last_seen": last_seen,
                     },
                 ],
             )