# test_media_storage.py
# Copyright 2018-2021 The Matrix.org Foundation C.I.C.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
  14. import os
  15. import shutil
  16. import tempfile
  17. from binascii import unhexlify
  18. from io import BytesIO
  19. from typing import Optional
  20. from unittest.mock import Mock
  21. from urllib import parse
  22. import attr
  23. from parameterized import parameterized, parameterized_class
  24. from PIL import Image as Image
  25. from twisted.internet import defer
  26. from twisted.internet.defer import Deferred
  27. from synapse.events.spamcheck import load_legacy_spam_checkers
  28. from synapse.logging.context import make_deferred_yieldable
  29. from synapse.rest import admin
  30. from synapse.rest.client import login
  31. from synapse.rest.media.v1._base import FileInfo
  32. from synapse.rest.media.v1.filepath import MediaFilePaths
  33. from synapse.rest.media.v1.media_storage import MediaStorage
  34. from synapse.rest.media.v1.storage_provider import FileStorageProviderBackend
  35. from tests import unittest
  36. from tests.server import FakeSite, make_request
  37. from tests.test_utils import SMALL_PNG
  38. from tests.utils import default_config
  39. class MediaStorageTests(unittest.HomeserverTestCase):
  40. needs_threadpool = True
  41. def prepare(self, reactor, clock, hs):
  42. self.test_dir = tempfile.mkdtemp(prefix="synapse-tests-")
  43. self.addCleanup(shutil.rmtree, self.test_dir)
  44. self.primary_base_path = os.path.join(self.test_dir, "primary")
  45. self.secondary_base_path = os.path.join(self.test_dir, "secondary")
  46. hs.config.media.media_store_path = self.primary_base_path
  47. storage_providers = [FileStorageProviderBackend(hs, self.secondary_base_path)]
  48. self.filepaths = MediaFilePaths(self.primary_base_path)
  49. self.media_storage = MediaStorage(
  50. hs, self.primary_base_path, self.filepaths, storage_providers
  51. )
  52. def test_ensure_media_is_in_local_cache(self):
  53. media_id = "some_media_id"
  54. test_body = "Test\n"
  55. # First we create a file that is in a storage provider but not in the
  56. # local primary media store
  57. rel_path = self.filepaths.local_media_filepath_rel(media_id)
  58. secondary_path = os.path.join(self.secondary_base_path, rel_path)
  59. os.makedirs(os.path.dirname(secondary_path))
  60. with open(secondary_path, "w") as f:
  61. f.write(test_body)
  62. # Now we run ensure_media_is_in_local_cache, which should copy the file
  63. # to the local cache.
  64. file_info = FileInfo(None, media_id)
  65. # This uses a real blocking threadpool so we have to wait for it to be
  66. # actually done :/
  67. x = defer.ensureDeferred(
  68. self.media_storage.ensure_media_is_in_local_cache(file_info)
  69. )
  70. # Hotloop until the threadpool does its job...
  71. self.wait_on_thread(x)
  72. local_path = self.get_success(x)
  73. self.assertTrue(os.path.exists(local_path))
  74. # Asserts the file is under the expected local cache directory
  75. self.assertEquals(
  76. os.path.commonprefix([self.primary_base_path, local_path]),
  77. self.primary_base_path,
  78. )
  79. with open(local_path) as f:
  80. body = f.read()
  81. self.assertEqual(test_body, body)
  82. @attr.s(slots=True, frozen=True)
  83. class _TestImage:
  84. """An image for testing thumbnailing with the expected results
  85. Attributes:
  86. data: The raw image to thumbnail
  87. content_type: The type of the image as a content type, e.g. "image/png"
  88. extension: The extension associated with the format, e.g. ".png"
  89. expected_cropped: The expected bytes from cropped thumbnailing, or None if
  90. test should just check for success.
  91. expected_scaled: The expected bytes from scaled thumbnailing, or None if
  92. test should just check for a valid image returned.
  93. expected_found: True if the file should exist on the server, or False if
  94. a 404 is expected.
  95. """
  96. data = attr.ib(type=bytes)
  97. content_type = attr.ib(type=bytes)
  98. extension = attr.ib(type=bytes)
  99. expected_cropped = attr.ib(type=Optional[bytes], default=None)
  100. expected_scaled = attr.ib(type=Optional[bytes], default=None)
  101. expected_found = attr.ib(default=True, type=bool)
  102. @parameterized_class(
  103. ("test_image",),
  104. [
  105. # smoll png
  106. (
  107. _TestImage(
  108. SMALL_PNG,
  109. b"image/png",
  110. b".png",
  111. unhexlify(
  112. b"89504e470d0a1a0a0000000d4948445200000020000000200806"
  113. b"000000737a7af40000001a49444154789cedc101010000008220"
  114. b"ffaf6e484001000000ef0610200001194334ee0000000049454e"
  115. b"44ae426082"
  116. ),
  117. unhexlify(
  118. b"89504e470d0a1a0a0000000d4948445200000001000000010806"
  119. b"0000001f15c4890000000d49444154789c636060606000000005"
  120. b"0001a5f645400000000049454e44ae426082"
  121. ),
  122. ),
  123. ),
  124. # small png with transparency.
  125. (
  126. _TestImage(
  127. unhexlify(
  128. b"89504e470d0a1a0a0000000d49484452000000010000000101000"
  129. b"00000376ef9240000000274524e5300010194fdae0000000a4944"
  130. b"4154789c636800000082008177cd72b60000000049454e44ae426"
  131. b"082"
  132. ),
  133. b"image/png",
  134. b".png",
  135. # Note that we don't check the output since it varies across
  136. # different versions of Pillow.
  137. ),
  138. ),
  139. # small lossless webp
  140. (
  141. _TestImage(
  142. unhexlify(
  143. b"524946461a000000574542505650384c0d0000002f0000001007"
  144. b"1011118888fe0700"
  145. ),
  146. b"image/webp",
  147. b".webp",
  148. ),
  149. ),
  150. # an empty file
  151. (
  152. _TestImage(
  153. b"",
  154. b"image/gif",
  155. b".gif",
  156. expected_found=False,
  157. ),
  158. ),
  159. ],
  160. )
  161. class MediaRepoTests(unittest.HomeserverTestCase):
  162. hijack_auth = True
  163. user_id = "@test:user"
  164. def make_homeserver(self, reactor, clock):
  165. self.fetches = []
  166. def get_file(destination, path, output_stream, args=None, max_size=None):
  167. """
  168. Returns tuple[int,dict,str,int] of file length, response headers,
  169. absolute URI, and response code.
  170. """
  171. def write_to(r):
  172. data, response = r
  173. output_stream.write(data)
  174. return response
  175. d = Deferred()
  176. d.addCallback(write_to)
  177. self.fetches.append((d, destination, path, args))
  178. return make_deferred_yieldable(d)
  179. client = Mock()
  180. client.get_file = get_file
  181. self.storage_path = self.mktemp()
  182. self.media_store_path = self.mktemp()
  183. os.mkdir(self.storage_path)
  184. os.mkdir(self.media_store_path)
  185. config = self.default_config()
  186. config["media_store_path"] = self.media_store_path
  187. config["max_image_pixels"] = 2000000
  188. provider_config = {
  189. "module": "synapse.rest.media.v1.storage_provider.FileStorageProviderBackend",
  190. "store_local": True,
  191. "store_synchronous": False,
  192. "store_remote": True,
  193. "config": {"directory": self.storage_path},
  194. }
  195. config["media_storage_providers"] = [provider_config]
  196. hs = self.setup_test_homeserver(config=config, federation_http_client=client)
  197. return hs
  198. def prepare(self, reactor, clock, hs):
  199. media_resource = hs.get_media_repository_resource()
  200. self.download_resource = media_resource.children[b"download"]
  201. self.thumbnail_resource = media_resource.children[b"thumbnail"]
  202. self.store = hs.get_datastore()
  203. self.media_repo = hs.get_media_repository()
  204. self.media_id = "example.com/12345"
  205. def _req(self, content_disposition, include_content_type=True):
  206. channel = make_request(
  207. self.reactor,
  208. FakeSite(self.download_resource, self.reactor),
  209. "GET",
  210. self.media_id,
  211. shorthand=False,
  212. await_result=False,
  213. )
  214. self.pump()
  215. # We've made one fetch, to example.com, using the media URL, and asking
  216. # the other server not to do a remote fetch
  217. self.assertEqual(len(self.fetches), 1)
  218. self.assertEqual(self.fetches[0][1], "example.com")
  219. self.assertEqual(
  220. self.fetches[0][2], "/_matrix/media/r0/download/" + self.media_id
  221. )
  222. self.assertEqual(self.fetches[0][3], {"allow_remote": "false"})
  223. headers = {
  224. b"Content-Length": [b"%d" % (len(self.test_image.data))],
  225. }
  226. if include_content_type:
  227. headers[b"Content-Type"] = [self.test_image.content_type]
  228. if content_disposition:
  229. headers[b"Content-Disposition"] = [content_disposition]
  230. self.fetches[0][0].callback(
  231. (self.test_image.data, (len(self.test_image.data), headers))
  232. )
  233. self.pump()
  234. self.assertEqual(channel.code, 200)
  235. return channel
  236. def test_handle_missing_content_type(self):
  237. channel = self._req(
  238. b"inline; filename=out" + self.test_image.extension,
  239. include_content_type=False,
  240. )
  241. headers = channel.headers
  242. self.assertEqual(channel.code, 200)
  243. self.assertEqual(
  244. headers.getRawHeaders(b"Content-Type"), [b"application/octet-stream"]
  245. )
  246. def test_disposition_filename_ascii(self):
  247. """
  248. If the filename is filename=<ascii> then Synapse will decode it as an
  249. ASCII string, and use filename= in the response.
  250. """
  251. channel = self._req(b"inline; filename=out" + self.test_image.extension)
  252. headers = channel.headers
  253. self.assertEqual(
  254. headers.getRawHeaders(b"Content-Type"), [self.test_image.content_type]
  255. )
  256. self.assertEqual(
  257. headers.getRawHeaders(b"Content-Disposition"),
  258. [b"inline; filename=out" + self.test_image.extension],
  259. )
  260. def test_disposition_filenamestar_utf8escaped(self):
  261. """
  262. If the filename is filename=*utf8''<utf8 escaped> then Synapse will
  263. correctly decode it as the UTF-8 string, and use filename* in the
  264. response.
  265. """
  266. filename = parse.quote("\u2603".encode()).encode("ascii")
  267. channel = self._req(
  268. b"inline; filename*=utf-8''" + filename + self.test_image.extension
  269. )
  270. headers = channel.headers
  271. self.assertEqual(
  272. headers.getRawHeaders(b"Content-Type"), [self.test_image.content_type]
  273. )
  274. self.assertEqual(
  275. headers.getRawHeaders(b"Content-Disposition"),
  276. [b"inline; filename*=utf-8''" + filename + self.test_image.extension],
  277. )
  278. def test_disposition_none(self):
  279. """
  280. If there is no filename, one isn't passed on in the Content-Disposition
  281. of the request.
  282. """
  283. channel = self._req(None)
  284. headers = channel.headers
  285. self.assertEqual(
  286. headers.getRawHeaders(b"Content-Type"), [self.test_image.content_type]
  287. )
  288. self.assertEqual(headers.getRawHeaders(b"Content-Disposition"), None)
  289. def test_thumbnail_crop(self):
  290. """Test that a cropped remote thumbnail is available."""
  291. self._test_thumbnail(
  292. "crop", self.test_image.expected_cropped, self.test_image.expected_found
  293. )
  294. def test_thumbnail_scale(self):
  295. """Test that a scaled remote thumbnail is available."""
  296. self._test_thumbnail(
  297. "scale", self.test_image.expected_scaled, self.test_image.expected_found
  298. )
  299. def test_invalid_type(self):
  300. """An invalid thumbnail type is never available."""
  301. self._test_thumbnail("invalid", None, False)
  302. @unittest.override_config(
  303. {"thumbnail_sizes": [{"width": 32, "height": 32, "method": "scale"}]}
  304. )
  305. def test_no_thumbnail_crop(self):
  306. """
  307. Override the config to generate only scaled thumbnails, but request a cropped one.
  308. """
  309. self._test_thumbnail("crop", None, False)
  310. @unittest.override_config(
  311. {"thumbnail_sizes": [{"width": 32, "height": 32, "method": "crop"}]}
  312. )
  313. def test_no_thumbnail_scale(self):
  314. """
  315. Override the config to generate only cropped thumbnails, but request a scaled one.
  316. """
  317. self._test_thumbnail("scale", None, False)
  318. def test_thumbnail_repeated_thumbnail(self):
  319. """Test that fetching the same thumbnail works, and deleting the on disk
  320. thumbnail regenerates it.
  321. """
  322. self._test_thumbnail(
  323. "scale", self.test_image.expected_scaled, self.test_image.expected_found
  324. )
  325. if not self.test_image.expected_found:
  326. return
  327. # Fetching again should work, without re-requesting the image from the
  328. # remote.
  329. params = "?width=32&height=32&method=scale"
  330. channel = make_request(
  331. self.reactor,
  332. FakeSite(self.thumbnail_resource, self.reactor),
  333. "GET",
  334. self.media_id + params,
  335. shorthand=False,
  336. await_result=False,
  337. )
  338. self.pump()
  339. self.assertEqual(channel.code, 200)
  340. if self.test_image.expected_scaled:
  341. self.assertEqual(
  342. channel.result["body"],
  343. self.test_image.expected_scaled,
  344. channel.result["body"],
  345. )
  346. # Deleting the thumbnail on disk then re-requesting it should work as
  347. # Synapse should regenerate missing thumbnails.
  348. origin, media_id = self.media_id.split("/")
  349. info = self.get_success(self.store.get_cached_remote_media(origin, media_id))
  350. file_id = info["filesystem_id"]
  351. thumbnail_dir = self.media_repo.filepaths.remote_media_thumbnail_dir(
  352. origin, file_id
  353. )
  354. shutil.rmtree(thumbnail_dir, ignore_errors=True)
  355. channel = make_request(
  356. self.reactor,
  357. FakeSite(self.thumbnail_resource, self.reactor),
  358. "GET",
  359. self.media_id + params,
  360. shorthand=False,
  361. await_result=False,
  362. )
  363. self.pump()
  364. self.assertEqual(channel.code, 200)
  365. if self.test_image.expected_scaled:
  366. self.assertEqual(
  367. channel.result["body"],
  368. self.test_image.expected_scaled,
  369. channel.result["body"],
  370. )
  371. def _test_thumbnail(self, method, expected_body, expected_found):
  372. params = "?width=32&height=32&method=" + method
  373. channel = make_request(
  374. self.reactor,
  375. FakeSite(self.thumbnail_resource, self.reactor),
  376. "GET",
  377. self.media_id + params,
  378. shorthand=False,
  379. await_result=False,
  380. )
  381. self.pump()
  382. headers = {
  383. b"Content-Length": [b"%d" % (len(self.test_image.data))],
  384. b"Content-Type": [self.test_image.content_type],
  385. }
  386. self.fetches[0][0].callback(
  387. (self.test_image.data, (len(self.test_image.data), headers))
  388. )
  389. self.pump()
  390. if expected_found:
  391. self.assertEqual(channel.code, 200)
  392. if expected_body is not None:
  393. self.assertEqual(
  394. channel.result["body"], expected_body, channel.result["body"]
  395. )
  396. else:
  397. # ensure that the result is at least some valid image
  398. Image.open(BytesIO(channel.result["body"]))
  399. else:
  400. # A 404 with a JSON body.
  401. self.assertEqual(channel.code, 404)
  402. self.assertEqual(
  403. channel.json_body,
  404. {
  405. "errcode": "M_NOT_FOUND",
  406. "error": "Not found [b'example.com', b'12345']",
  407. },
  408. )
  409. @parameterized.expand([("crop", 16), ("crop", 64), ("scale", 16), ("scale", 64)])
  410. def test_same_quality(self, method, desired_size):
  411. """Test that choosing between thumbnails with the same quality rating succeeds.
  412. We are not particular about which thumbnail is chosen."""
  413. self.assertIsNotNone(
  414. self.thumbnail_resource._select_thumbnail(
  415. desired_width=desired_size,
  416. desired_height=desired_size,
  417. desired_method=method,
  418. desired_type=self.test_image.content_type,
  419. # Provide two identical thumbnails which are guaranteed to have the same
  420. # quality rating.
  421. thumbnail_infos=[
  422. {
  423. "thumbnail_width": 32,
  424. "thumbnail_height": 32,
  425. "thumbnail_method": method,
  426. "thumbnail_type": self.test_image.content_type,
  427. "thumbnail_length": 256,
  428. "filesystem_id": f"thumbnail1{self.test_image.extension}",
  429. },
  430. {
  431. "thumbnail_width": 32,
  432. "thumbnail_height": 32,
  433. "thumbnail_method": method,
  434. "thumbnail_type": self.test_image.content_type,
  435. "thumbnail_length": 256,
  436. "filesystem_id": f"thumbnail2{self.test_image.extension}",
  437. },
  438. ],
  439. file_id=f"image{self.test_image.extension}",
  440. url_cache=None,
  441. server_name=None,
  442. )
  443. )
  444. def test_x_robots_tag_header(self):
  445. """
  446. Tests that the `X-Robots-Tag` header is present, which informs web crawlers
  447. to not index, archive, or follow links in media.
  448. """
  449. channel = self._req(b"inline; filename=out" + self.test_image.extension)
  450. headers = channel.headers
  451. self.assertEqual(
  452. headers.getRawHeaders(b"X-Robots-Tag"),
  453. [b"noindex, nofollow, noarchive, noimageindex"],
  454. )
  455. class TestSpamChecker:
  456. """A spam checker module that rejects all media that includes the bytes
  457. `evil`.
  458. """
  459. def __init__(self, config, api):
  460. self.config = config
  461. self.api = api
  462. def parse_config(config):
  463. return config
  464. async def check_event_for_spam(self, foo):
  465. return False # allow all events
  466. async def user_may_invite(self, inviter_userid, invitee_userid, room_id):
  467. return True # allow all invites
  468. async def user_may_create_room(self, userid):
  469. return True # allow all room creations
  470. async def user_may_create_room_alias(self, userid, room_alias):
  471. return True # allow all room aliases
  472. async def user_may_publish_room(self, userid, room_id):
  473. return True # allow publishing of all rooms
  474. async def check_media_file_for_spam(self, file_wrapper, file_info) -> bool:
  475. buf = BytesIO()
  476. await file_wrapper.write_chunks_to(buf.write)
  477. return b"evil" in buf.getvalue()
  478. class SpamCheckerTestCase(unittest.HomeserverTestCase):
  479. servlets = [
  480. login.register_servlets,
  481. admin.register_servlets,
  482. ]
  483. def prepare(self, reactor, clock, hs):
  484. self.user = self.register_user("user", "pass")
  485. self.tok = self.login("user", "pass")
  486. # Allow for uploading and downloading to/from the media repo
  487. self.media_repo = hs.get_media_repository_resource()
  488. self.download_resource = self.media_repo.children[b"download"]
  489. self.upload_resource = self.media_repo.children[b"upload"]
  490. load_legacy_spam_checkers(hs)
  491. def default_config(self):
  492. config = default_config("test")
  493. config.update(
  494. {
  495. "spam_checker": [
  496. {
  497. "module": TestSpamChecker.__module__ + ".TestSpamChecker",
  498. "config": {},
  499. }
  500. ]
  501. }
  502. )
  503. return config
  504. def test_upload_innocent(self):
  505. """Attempt to upload some innocent data that should be allowed."""
  506. self.helper.upload_media(
  507. self.upload_resource, SMALL_PNG, tok=self.tok, expect_code=200
  508. )
  509. def test_upload_ban(self):
  510. """Attempt to upload some data that includes bytes "evil", which should
  511. get rejected by the spam checker.
  512. """
  513. data = b"Some evil data"
  514. self.helper.upload_media(
  515. self.upload_resource, data, tok=self.tok, expect_code=400
  516. )