Browse Source

Add information on uploaded media to user export command. (#15107)

Dirk Klimpel 1 year ago
parent
commit
a068ad7dd4

+ 1 - 0
changelog.d/15107.feature

@@ -0,0 +1 @@
+Add media information to the command line [user data export tool](https://matrix-org.github.io/synapse/v1.79/usage/administration/admin_faq.html#how-can-i-export-user-data).

+ 58 - 16
docs/usage/administration/admin_faq.md

@@ -70,13 +70,55 @@ output-directory
 │       ├───state
 │       ├───invite_state
 │       └───knock_state
-└───user_data
-    ├───account_data
-    │   ├───global
-    │   └───<room_id>
-    ├───connections
-    ├───devices
-    └───profile
+├───user_data
+│   ├───account_data
+│   │   ├───global
+│   │   └───<room_id>
+│   ├───connections
+│   ├───devices
+│   └───profile
+└───media_ids
+    └───<media_id>
+```
+
+The `media_ids` folder contains only the metadata of the media uploaded by the user.
+It does not contain the media itself.
+Furthermore, only the `media_ids` that Synapse manages itself are exported.
+If another media repository (e.g. [matrix-media-repo](https://github.com/turt2live/matrix-media-repo))
+is used, the data must be exported separately.
+
+With the `media_ids` the media files can be downloaded.
+Media that have been sent in encrypted rooms are only retrieved in encrypted form.
+The following script can help with downloading the media files:
+
+```bash
+#!/usr/bin/env bash
+
+# Download the media files referenced by an exported `media_ids` directory.
+#
+# Parameters
+#
+#   source_directory: Directory which contains the export with the media_ids.
+#   target_directory: Directory into which all files are to be downloaded.
+#   repository_url: Address of the media repository or media worker.
+#   serverName: Name of the server (`server_name` from homeserver.yaml).
+#
+#   Example:
+#       ./download_media.sh /tmp/export_data/media_ids/ /tmp/export_data/media_files/ http://localhost:8008 matrix.example.com
+
+source_directory=$1
+target_directory=$2
+repository_url=$3
+serverName=$4
+
+# Every expansion below is quoted so that paths containing spaces or glob
+# characters are passed through unchanged.
+mkdir -p -- "$target_directory"
+
+for file in "$source_directory"/*; do
+    # An empty directory leaves the glob unexpanded; skip the literal pattern.
+    [ -e "$file" ] || continue
+
+    filename=$(basename -- "$file")
+    url="$repository_url/_matrix/media/v3/download/$serverName/$filename"
+    echo "Downloading $filename - $url"
+    # `-o /dev/null` discards wget's log output; `-P` sets the download directory.
+    if ! wget -o /dev/null -P "$target_directory" "$url"; then
+        echo "Could not download $filename"
+    fi
+done
 ```
 
 Manually resetting passwords
@@ -87,7 +129,7 @@ can reset a user's password using the [admin API](../../admin_api/user_admin_api
 
 I have a problem with my server. Can I just delete my database and start again?
 ---
-Deleting your database is unlikely to make anything better. 
+Deleting your database is unlikely to make anything better.
 
 It's easy to make the mistake of thinking that you can start again from a clean
 slate by dropping your database, but things don't work like that in a federated
@@ -102,7 +144,7 @@ Come and seek help in https://matrix.to/#/#synapse:matrix.org.
 
 There are two exceptions when it might be sensible to delete your database and start again:
 * You have *never* joined any rooms which are federated with other servers. For
-instance, a local deployment which the outside world can't talk to. 
+instance, a local deployment which the outside world can't talk to.
 * You are changing the `server_name` in the homeserver configuration. In effect
 this makes your server a completely new one from the point of view of the network,
 so in this case it makes sense to start with a clean database.
@@ -115,7 +157,7 @@ Using the following curl command:
 curl -H 'Authorization: Bearer <access-token>' -X DELETE https://matrix.org/_matrix/client/r0/directory/room/<room-alias>
 ```
 `<access-token>` - can be obtained in riot by looking in the riot settings, down the bottom is:
-Access Token:\<click to reveal\> 
+Access Token:\<click to reveal\>
 
 `<room-alias>` - the room alias, eg. #my_room:matrix.org this possibly needs to be URL encoded also, for example  %23my_room%3Amatrix.org
 
@@ -152,13 +194,13 @@ What are the biggest rooms on my server?
 ---
 
 ```sql
-SELECT s.canonical_alias, g.room_id, count(*) AS num_rows 
-FROM 
-  state_groups_state AS g, 
-  room_stats_state AS s 
-WHERE g.room_id = s.room_id 
+SELECT s.canonical_alias, g.room_id, count(*) AS num_rows
+FROM
+  state_groups_state AS g,
+  room_stats_state AS s
+WHERE g.room_id = s.room_id
 GROUP BY s.canonical_alias, g.room_id
-ORDER BY num_rows desc 
+ORDER BY num_rows desc
 LIMIT 10;
 ```
 

+ 10 - 0
synapse/app/admin_cmd.py

@@ -44,6 +44,7 @@ from synapse.storage.databases.main.event_push_actions import (
 )
 from synapse.storage.databases.main.events_worker import EventsWorkerStore
 from synapse.storage.databases.main.filtering import FilteringWorkerStore
+from synapse.storage.databases.main.media_repository import MediaRepositoryStore
 from synapse.storage.databases.main.profile import ProfileWorkerStore
 from synapse.storage.databases.main.push_rule import PushRulesWorkerStore
 from synapse.storage.databases.main.receipts import ReceiptsWorkerStore
@@ -86,6 +87,7 @@ class AdminCmdSlavedStore(
     RegistrationWorkerStore,
     RoomWorkerStore,
     ProfileWorkerStore,
+    MediaRepositoryStore,
 ):
     def __init__(
         self,
@@ -235,6 +237,14 @@ class FileExfiltrationWriter(ExfiltrationWriter):
         with open(account_data_file, "a") as f:
             json.dump(account_data, fp=f)
 
+    def write_media_id(self, media_id: str, media_metadata: JsonDict) -> None:
+        file_directory = os.path.join(self.base_directory, "media_ids")
+        os.makedirs(file_directory, exist_ok=True)
+        media_id_file = os.path.join(file_directory, media_id)
+
+        with open(media_id_file, "w") as f:
+            json.dump(media_metadata, fp=f)
+
     def finished(self) -> str:
         return self.base_directory
 

+ 38 - 0
synapse/handlers/admin.py

@@ -252,16 +252,19 @@ class AdminHandler:
         profile = await self.get_user(UserID.from_string(user_id))
         if profile is not None:
             writer.write_profile(profile)
+            logger.info("[%s] Written profile", user_id)
 
         # Get all devices the user has
         devices = await self._device_handler.get_devices_by_user(user_id)
         writer.write_devices(devices)
+        logger.info("[%s] Written %s devices", user_id, len(devices))
 
         # Get all connections the user has
         connections = await self.get_whois(UserID.from_string(user_id))
         writer.write_connections(
             connections["devices"][""]["sessions"][0]["connections"]
         )
+        logger.info("[%s] Written %s connections", user_id, len(connections))
 
         # Get all account data the user has global and in rooms
         global_data = await self._store.get_global_account_data_for_user(user_id)
@@ -269,6 +272,29 @@ class AdminHandler:
         writer.write_account_data("global", global_data)
         for room_id in by_room_data:
             writer.write_account_data(room_id, by_room_data[room_id])
+        logger.info(
+            "[%s] Written account data for %s rooms", user_id, len(by_room_data)
+        )
+
+        # Get all media ids the user has
+        limit = 100
+        start = 0
+        while True:
+            media_ids, total = await self._store.get_local_media_by_user_paginate(
+                start, limit, user_id
+            )
+            for media in media_ids:
+                writer.write_media_id(media["media_id"], media)
+
+            logger.info(
+                "[%s] Written %d media_ids of %s",
+                user_id,
+                (start + len(media_ids)),
+                total,
+            )
+            if (start + limit) >= total:
+                break
+            start += limit
 
         return writer.finished()
 
@@ -359,6 +385,18 @@ class ExfiltrationWriter(metaclass=abc.ABCMeta):
         """
         raise NotImplementedError()
 
+    @abc.abstractmethod
+    def write_media_id(self, media_id: str, media_metadata: JsonDict) -> None:
+        """Write the media's metadata of a user.
+        Exports only the metadata, as this can be fetched from the database via
+        read only. In order to access the files, a connection to the correct
+        media repository would be required.
+
+        Args:
+            media_id: ID of the media.
+            media_metadata: Metadata of one media file.
+        """
+
     @abc.abstractmethod
     def finished(self) -> Any:
         """Called when all data has successfully been exported and written.

+ 29 - 0
tests/handlers/test_admin.py

@@ -23,6 +23,7 @@ from synapse.api.constants import EventTypes, JoinRules
 from synapse.api.room_versions import RoomVersions
 from synapse.rest.client import knock, login, room
 from synapse.server import HomeServer
+from synapse.types import UserID
 from synapse.util import Clock
 
 from tests import unittest
@@ -323,3 +324,31 @@ class ExfiltrateData(unittest.HomeserverTestCase):
         args = writer.write_account_data.call_args_list[1][0]
         self.assertEqual(args[0], "test_room")
         self.assertEqual(args[1]["m.per_room"]["b"], 2)
+
+    def test_media_ids(self) -> None:
+        """Tests that media's metadata get exported."""
+
+        self.get_success(
+            self._store.store_local_media(
+                media_id="media_1",
+                media_type="image/png",
+                time_now_ms=self.clock.time_msec(),
+                upload_name=None,
+                media_length=50,
+                user_id=UserID.from_string(self.user2),
+            )
+        )
+
+        writer = Mock()
+
+        self.get_success(self.admin_handler.export_user_data(self.user2, writer))
+
+        writer.write_media_id.assert_called_once()
+
+        args = writer.write_media_id.call_args[0]
+        self.assertEqual(args[0], "media_1")
+        self.assertEqual(args[1]["media_id"], "media_1")
+        self.assertEqual(args[1]["media_length"], 50)
+        self.assertGreater(args[1]["created_ts"], 0)
+        self.assertIsNone(args[1]["upload_name"])
+        self.assertIsNone(args[1]["last_access_ts"])