test_url_preview.py 37 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120
  1. # Copyright 2018 New Vector Ltd
  2. # Copyright 2021 The Matrix.org Foundation C.I.C.
  3. #
  4. # Licensed under the Apache License, Version 2.0 (the "License");
  5. # you may not use this file except in compliance with the License.
  6. # You may obtain a copy of the License at
  7. #
  8. # http://www.apache.org/licenses/LICENSE-2.0
  9. #
  10. # Unless required by applicable law or agreed to in writing, software
  11. # distributed under the License is distributed on an "AS IS" BASIS,
  12. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. # See the License for the specific language governing permissions and
  14. # limitations under the License.
  15. import base64
  16. import json
  17. import os
  18. import re
  19. from urllib.parse import urlencode
  20. from twisted.internet._resolver import HostResolution
  21. from twisted.internet.address import IPv4Address, IPv6Address
  22. from twisted.internet.error import DNSLookupError
  23. from twisted.test.proto_helpers import AccumulatingProtocol
  24. from synapse.config.oembed import OEmbedEndpointConfig
  25. from synapse.rest.media.v1.preview_url_resource import IMAGE_CACHE_EXPIRY_MS
  26. from synapse.types import JsonDict
  27. from synapse.util.stringutils import parse_and_validate_mxc_uri
  28. from tests import unittest
  29. from tests.server import FakeTransport
  30. from tests.test_utils import SMALL_PNG
  31. from tests.utils import MockClock
  32. try:
  33. import lxml
  34. except ImportError:
  35. lxml = None
  36. class URLPreviewTests(unittest.HomeserverTestCase):
    # Setting ``skip`` marks these tests as skipped when lxml failed to import
    # (presumably honoured by the trial-based test runner -- confirm).
    if not lxml:
        skip = "url preview feature requires lxml"

    # NOTE(review): these two are consumed by the HomeserverTestCase base
    # class, which is not visible here -- presumably requests are
    # authenticated as ``user_id``; confirm against the base class.
    hijack_auth = True
    user_id = "@test:user"

    # HTML page carrying OpenGraph title and description tags; several tests
    # send it as the fake remote server's response body.
    end_content = (
        b"<html><head>"
        b'<meta property="og:title" content="~matrix~" />'
        b'<meta property="og:description" content="hi" />'
        b"</head></html>"
    )
  47. def make_homeserver(self, reactor, clock):
  48. config = self.default_config()
  49. config["url_preview_enabled"] = True
  50. config["max_spider_size"] = 9999999
  51. config["url_preview_ip_range_blacklist"] = (
  52. "192.168.1.1",
  53. "1.0.0.0/8",
  54. "3fff:ffff:ffff:ffff:ffff:ffff:ffff:ffff",
  55. "2001:800::/21",
  56. )
  57. config["url_preview_ip_range_whitelist"] = ("1.1.1.1",)
  58. config["url_preview_url_blacklist"] = []
  59. config["url_preview_accept_language"] = [
  60. "en-UK",
  61. "en-US;q=0.9",
  62. "fr;q=0.8",
  63. "*;q=0.7",
  64. ]
  65. self.storage_path = self.mktemp()
  66. self.media_store_path = self.mktemp()
  67. os.mkdir(self.storage_path)
  68. os.mkdir(self.media_store_path)
  69. config["media_store_path"] = self.media_store_path
  70. provider_config = {
  71. "module": "synapse.rest.media.v1.storage_provider.FileStorageProviderBackend",
  72. "store_local": True,
  73. "store_synchronous": False,
  74. "store_remote": True,
  75. "config": {"directory": self.storage_path},
  76. }
  77. config["media_storage_providers"] = [provider_config]
  78. hs = self.setup_test_homeserver(config=config)
  79. # After the hs is created, modify the parsed oEmbed config (to avoid
  80. # messing with files).
  81. #
  82. # Note that HTTP URLs are used to avoid having to deal with TLS in tests.
  83. hs.config.oembed.oembed_patterns = [
  84. OEmbedEndpointConfig(
  85. api_endpoint="http://publish.twitter.com/oembed",
  86. url_patterns=[
  87. re.compile(r"http://twitter\.com/.+/status/.+"),
  88. ],
  89. formats=None,
  90. ),
  91. OEmbedEndpointConfig(
  92. api_endpoint="http://www.hulu.com/api/oembed.{format}",
  93. url_patterns=[
  94. re.compile(r"http://www\.hulu\.com/watch/.+"),
  95. ],
  96. formats=["json"],
  97. ),
  98. ]
  99. return hs
  100. def prepare(self, reactor, clock, hs):
  101. self.media_repo = hs.get_media_repository_resource()
  102. self.preview_url = self.media_repo.children[b"preview_url"]
  103. self.lookups = {}
  104. class Resolver:
  105. def resolveHostName(
  106. _self,
  107. resolutionReceiver,
  108. hostName,
  109. portNumber=0,
  110. addressTypes=None,
  111. transportSemantics="TCP",
  112. ):
  113. resolution = HostResolution(hostName)
  114. resolutionReceiver.resolutionBegan(resolution)
  115. if hostName not in self.lookups:
  116. raise DNSLookupError("OH NO")
  117. for i in self.lookups[hostName]:
  118. resolutionReceiver.addressResolved(i[0]("TCP", i[1], portNumber))
  119. resolutionReceiver.resolutionComplete()
  120. return resolutionReceiver
  121. self.reactor.nameResolver = Resolver()
  122. def create_test_resource(self):
  123. return self.hs.get_media_repository_resource()
  124. def _assert_small_png(self, json_body: JsonDict) -> None:
  125. """Assert properties from the SMALL_PNG test image."""
  126. self.assertTrue(json_body["og:image"].startswith("mxc://"))
  127. self.assertEqual(json_body["og:image:height"], 1)
  128. self.assertEqual(json_body["og:image:width"], 1)
  129. self.assertEqual(json_body["og:image:type"], "image/png")
  130. self.assertEqual(json_body["matrix:image:size"], 67)
  131. def test_cache_returns_correct_type(self):
  132. self.lookups["matrix.org"] = [(IPv4Address, "10.1.2.3")]
  133. channel = self.make_request(
  134. "GET",
  135. "preview_url?url=http://matrix.org",
  136. shorthand=False,
  137. await_result=False,
  138. )
  139. self.pump()
  140. client = self.reactor.tcpClients[0][2].buildProtocol(None)
  141. server = AccumulatingProtocol()
  142. server.makeConnection(FakeTransport(client, self.reactor))
  143. client.makeConnection(FakeTransport(server, self.reactor))
  144. client.dataReceived(
  145. b"HTTP/1.0 200 OK\r\nContent-Length: %d\r\nContent-Type: text/html\r\n\r\n"
  146. % (len(self.end_content),)
  147. + self.end_content
  148. )
  149. self.pump()
  150. self.assertEqual(channel.code, 200)
  151. self.assertEqual(
  152. channel.json_body, {"og:title": "~matrix~", "og:description": "hi"}
  153. )
  154. # Check the cache returns the correct response
  155. channel = self.make_request(
  156. "GET", "preview_url?url=http://matrix.org", shorthand=False
  157. )
  158. # Check the cache response has the same content
  159. self.assertEqual(channel.code, 200)
  160. self.assertEqual(
  161. channel.json_body, {"og:title": "~matrix~", "og:description": "hi"}
  162. )
  163. # Clear the in-memory cache
  164. self.assertIn("http://matrix.org", self.preview_url._cache)
  165. self.preview_url._cache.pop("http://matrix.org")
  166. self.assertNotIn("http://matrix.org", self.preview_url._cache)
  167. # Check the database cache returns the correct response
  168. channel = self.make_request(
  169. "GET", "preview_url?url=http://matrix.org", shorthand=False
  170. )
  171. # Check the cache response has the same content
  172. self.assertEqual(channel.code, 200)
  173. self.assertEqual(
  174. channel.json_body, {"og:title": "~matrix~", "og:description": "hi"}
  175. )
  176. def test_non_ascii_preview_httpequiv(self):
  177. self.lookups["matrix.org"] = [(IPv4Address, "10.1.2.3")]
  178. end_content = (
  179. b"<html><head>"
  180. b'<meta http-equiv="Content-Type" content="text/html; charset=windows-1251"/>'
  181. b'<meta property="og:title" content="\xe4\xea\xe0" />'
  182. b'<meta property="og:description" content="hi" />'
  183. b"</head></html>"
  184. )
  185. channel = self.make_request(
  186. "GET",
  187. "preview_url?url=http://matrix.org",
  188. shorthand=False,
  189. await_result=False,
  190. )
  191. self.pump()
  192. client = self.reactor.tcpClients[0][2].buildProtocol(None)
  193. server = AccumulatingProtocol()
  194. server.makeConnection(FakeTransport(client, self.reactor))
  195. client.makeConnection(FakeTransport(server, self.reactor))
  196. client.dataReceived(
  197. (
  198. b"HTTP/1.0 200 OK\r\nContent-Length: %d\r\n"
  199. b'Content-Type: text/html; charset="utf8"\r\n\r\n'
  200. )
  201. % (len(end_content),)
  202. + end_content
  203. )
  204. self.pump()
  205. self.assertEqual(channel.code, 200)
  206. self.assertEqual(channel.json_body["og:title"], "\u0434\u043a\u0430")
  207. def test_video_rejected(self):
  208. self.lookups["matrix.org"] = [(IPv4Address, "10.1.2.3")]
  209. end_content = b"anything"
  210. channel = self.make_request(
  211. "GET",
  212. "preview_url?url=http://matrix.org",
  213. shorthand=False,
  214. await_result=False,
  215. )
  216. self.pump()
  217. client = self.reactor.tcpClients[0][2].buildProtocol(None)
  218. server = AccumulatingProtocol()
  219. server.makeConnection(FakeTransport(client, self.reactor))
  220. client.makeConnection(FakeTransport(server, self.reactor))
  221. client.dataReceived(
  222. (
  223. b"HTTP/1.0 200 OK\r\nContent-Length: %d\r\n"
  224. b"Content-Type: video/mp4\r\n\r\n"
  225. )
  226. % (len(end_content))
  227. + end_content
  228. )
  229. self.pump()
  230. self.assertEqual(channel.code, 502)
  231. self.assertEqual(
  232. channel.json_body,
  233. {
  234. "errcode": "M_UNKNOWN",
  235. "error": "Requested file's content type not allowed for this operation: video/mp4",
  236. },
  237. )
  238. def test_audio_rejected(self):
  239. self.lookups["matrix.org"] = [(IPv4Address, "10.1.2.3")]
  240. end_content = b"anything"
  241. channel = self.make_request(
  242. "GET",
  243. "preview_url?url=http://matrix.org",
  244. shorthand=False,
  245. await_result=False,
  246. )
  247. self.pump()
  248. client = self.reactor.tcpClients[0][2].buildProtocol(None)
  249. server = AccumulatingProtocol()
  250. server.makeConnection(FakeTransport(client, self.reactor))
  251. client.makeConnection(FakeTransport(server, self.reactor))
  252. client.dataReceived(
  253. (
  254. b"HTTP/1.0 200 OK\r\nContent-Length: %d\r\n"
  255. b"Content-Type: audio/aac\r\n\r\n"
  256. )
  257. % (len(end_content))
  258. + end_content
  259. )
  260. self.pump()
  261. self.assertEqual(channel.code, 502)
  262. self.assertEqual(
  263. channel.json_body,
  264. {
  265. "errcode": "M_UNKNOWN",
  266. "error": "Requested file's content type not allowed for this operation: audio/aac",
  267. },
  268. )
  269. def test_non_ascii_preview_content_type(self):
  270. self.lookups["matrix.org"] = [(IPv4Address, "10.1.2.3")]
  271. end_content = (
  272. b"<html><head>"
  273. b'<meta property="og:title" content="\xe4\xea\xe0" />'
  274. b'<meta property="og:description" content="hi" />'
  275. b"</head></html>"
  276. )
  277. channel = self.make_request(
  278. "GET",
  279. "preview_url?url=http://matrix.org",
  280. shorthand=False,
  281. await_result=False,
  282. )
  283. self.pump()
  284. client = self.reactor.tcpClients[0][2].buildProtocol(None)
  285. server = AccumulatingProtocol()
  286. server.makeConnection(FakeTransport(client, self.reactor))
  287. client.makeConnection(FakeTransport(server, self.reactor))
  288. client.dataReceived(
  289. (
  290. b"HTTP/1.0 200 OK\r\nContent-Length: %d\r\n"
  291. b'Content-Type: text/html; charset="windows-1251"\r\n\r\n'
  292. )
  293. % (len(end_content),)
  294. + end_content
  295. )
  296. self.pump()
  297. self.assertEqual(channel.code, 200)
  298. self.assertEqual(channel.json_body["og:title"], "\u0434\u043a\u0430")
  299. def test_overlong_title(self):
  300. self.lookups["matrix.org"] = [(IPv4Address, "10.1.2.3")]
  301. end_content = (
  302. b"<html><head>"
  303. b"<title>" + b"x" * 2000 + b"</title>"
  304. b'<meta property="og:description" content="hi" />'
  305. b"</head></html>"
  306. )
  307. channel = self.make_request(
  308. "GET",
  309. "preview_url?url=http://matrix.org",
  310. shorthand=False,
  311. await_result=False,
  312. )
  313. self.pump()
  314. client = self.reactor.tcpClients[0][2].buildProtocol(None)
  315. server = AccumulatingProtocol()
  316. server.makeConnection(FakeTransport(client, self.reactor))
  317. client.makeConnection(FakeTransport(server, self.reactor))
  318. client.dataReceived(
  319. (
  320. b"HTTP/1.0 200 OK\r\nContent-Length: %d\r\n"
  321. b'Content-Type: text/html; charset="windows-1251"\r\n\r\n'
  322. )
  323. % (len(end_content),)
  324. + end_content
  325. )
  326. self.pump()
  327. self.assertEqual(channel.code, 200)
  328. res = channel.json_body
  329. # We should only see the `og:description` field, as `title` is too long and should be stripped out
  330. self.assertCountEqual(["og:description"], res.keys())
  331. def test_ipaddr(self):
  332. """
  333. IP addresses can be previewed directly.
  334. """
  335. self.lookups["example.com"] = [(IPv4Address, "10.1.2.3")]
  336. channel = self.make_request(
  337. "GET",
  338. "preview_url?url=http://example.com",
  339. shorthand=False,
  340. await_result=False,
  341. )
  342. self.pump()
  343. client = self.reactor.tcpClients[0][2].buildProtocol(None)
  344. server = AccumulatingProtocol()
  345. server.makeConnection(FakeTransport(client, self.reactor))
  346. client.makeConnection(FakeTransport(server, self.reactor))
  347. client.dataReceived(
  348. b"HTTP/1.0 200 OK\r\nContent-Length: %d\r\nContent-Type: text/html\r\n\r\n"
  349. % (len(self.end_content),)
  350. + self.end_content
  351. )
  352. self.pump()
  353. self.assertEqual(channel.code, 200)
  354. self.assertEqual(
  355. channel.json_body, {"og:title": "~matrix~", "og:description": "hi"}
  356. )
  357. def test_blacklisted_ip_specific(self):
  358. """
  359. Blacklisted IP addresses, found via DNS, are not spidered.
  360. """
  361. self.lookups["example.com"] = [(IPv4Address, "192.168.1.1")]
  362. channel = self.make_request(
  363. "GET", "preview_url?url=http://example.com", shorthand=False
  364. )
  365. # No requests made.
  366. self.assertEqual(len(self.reactor.tcpClients), 0)
  367. self.assertEqual(channel.code, 502)
  368. self.assertEqual(
  369. channel.json_body,
  370. {
  371. "errcode": "M_UNKNOWN",
  372. "error": "DNS resolution failure during URL preview generation",
  373. },
  374. )
  375. def test_blacklisted_ip_range(self):
  376. """
  377. Blacklisted IP ranges, IPs found over DNS, are not spidered.
  378. """
  379. self.lookups["example.com"] = [(IPv4Address, "1.1.1.2")]
  380. channel = self.make_request(
  381. "GET", "preview_url?url=http://example.com", shorthand=False
  382. )
  383. self.assertEqual(channel.code, 502)
  384. self.assertEqual(
  385. channel.json_body,
  386. {
  387. "errcode": "M_UNKNOWN",
  388. "error": "DNS resolution failure during URL preview generation",
  389. },
  390. )
  391. def test_blacklisted_ip_specific_direct(self):
  392. """
  393. Blacklisted IP addresses, accessed directly, are not spidered.
  394. """
  395. channel = self.make_request(
  396. "GET", "preview_url?url=http://192.168.1.1", shorthand=False
  397. )
  398. # No requests made.
  399. self.assertEqual(len(self.reactor.tcpClients), 0)
  400. self.assertEqual(
  401. channel.json_body,
  402. {
  403. "errcode": "M_UNKNOWN",
  404. "error": "IP address blocked by IP blacklist entry",
  405. },
  406. )
  407. self.assertEqual(channel.code, 403)
  408. def test_blacklisted_ip_range_direct(self):
  409. """
  410. Blacklisted IP ranges, accessed directly, are not spidered.
  411. """
  412. channel = self.make_request(
  413. "GET", "preview_url?url=http://1.1.1.2", shorthand=False
  414. )
  415. self.assertEqual(channel.code, 403)
  416. self.assertEqual(
  417. channel.json_body,
  418. {
  419. "errcode": "M_UNKNOWN",
  420. "error": "IP address blocked by IP blacklist entry",
  421. },
  422. )
  423. def test_blacklisted_ip_range_whitelisted_ip(self):
  424. """
  425. Blacklisted but then subsequently whitelisted IP addresses can be
  426. spidered.
  427. """
  428. self.lookups["example.com"] = [(IPv4Address, "1.1.1.1")]
  429. channel = self.make_request(
  430. "GET",
  431. "preview_url?url=http://example.com",
  432. shorthand=False,
  433. await_result=False,
  434. )
  435. self.pump()
  436. client = self.reactor.tcpClients[0][2].buildProtocol(None)
  437. server = AccumulatingProtocol()
  438. server.makeConnection(FakeTransport(client, self.reactor))
  439. client.makeConnection(FakeTransport(server, self.reactor))
  440. client.dataReceived(
  441. b"HTTP/1.0 200 OK\r\nContent-Length: %d\r\nContent-Type: text/html\r\n\r\n"
  442. % (len(self.end_content),)
  443. + self.end_content
  444. )
  445. self.pump()
  446. self.assertEqual(channel.code, 200)
  447. self.assertEqual(
  448. channel.json_body, {"og:title": "~matrix~", "og:description": "hi"}
  449. )
  450. def test_blacklisted_ip_with_external_ip(self):
  451. """
  452. If a hostname resolves a blacklisted IP, even if there's a
  453. non-blacklisted one, it will be rejected.
  454. """
  455. # Hardcode the URL resolving to the IP we want.
  456. self.lookups["example.com"] = [
  457. (IPv4Address, "1.1.1.2"),
  458. (IPv4Address, "10.1.2.3"),
  459. ]
  460. channel = self.make_request(
  461. "GET", "preview_url?url=http://example.com", shorthand=False
  462. )
  463. self.assertEqual(channel.code, 502)
  464. self.assertEqual(
  465. channel.json_body,
  466. {
  467. "errcode": "M_UNKNOWN",
  468. "error": "DNS resolution failure during URL preview generation",
  469. },
  470. )
  471. def test_blacklisted_ipv6_specific(self):
  472. """
  473. Blacklisted IP addresses, found via DNS, are not spidered.
  474. """
  475. self.lookups["example.com"] = [
  476. (IPv6Address, "3fff:ffff:ffff:ffff:ffff:ffff:ffff:ffff")
  477. ]
  478. channel = self.make_request(
  479. "GET", "preview_url?url=http://example.com", shorthand=False
  480. )
  481. # No requests made.
  482. self.assertEqual(len(self.reactor.tcpClients), 0)
  483. self.assertEqual(channel.code, 502)
  484. self.assertEqual(
  485. channel.json_body,
  486. {
  487. "errcode": "M_UNKNOWN",
  488. "error": "DNS resolution failure during URL preview generation",
  489. },
  490. )
  491. def test_blacklisted_ipv6_range(self):
  492. """
  493. Blacklisted IP ranges, IPs found over DNS, are not spidered.
  494. """
  495. self.lookups["example.com"] = [(IPv6Address, "2001:800::1")]
  496. channel = self.make_request(
  497. "GET", "preview_url?url=http://example.com", shorthand=False
  498. )
  499. self.assertEqual(channel.code, 502)
  500. self.assertEqual(
  501. channel.json_body,
  502. {
  503. "errcode": "M_UNKNOWN",
  504. "error": "DNS resolution failure during URL preview generation",
  505. },
  506. )
  507. def test_OPTIONS(self):
  508. """
  509. OPTIONS returns the OPTIONS.
  510. """
  511. channel = self.make_request(
  512. "OPTIONS", "preview_url?url=http://example.com", shorthand=False
  513. )
  514. self.assertEqual(channel.code, 200)
  515. self.assertEqual(channel.json_body, {})
  516. def test_accept_language_config_option(self):
  517. """
  518. Accept-Language header is sent to the remote server
  519. """
  520. self.lookups["example.com"] = [(IPv4Address, "10.1.2.3")]
  521. # Build and make a request to the server
  522. channel = self.make_request(
  523. "GET",
  524. "preview_url?url=http://example.com",
  525. shorthand=False,
  526. await_result=False,
  527. )
  528. self.pump()
  529. # Extract Synapse's tcp client
  530. client = self.reactor.tcpClients[0][2].buildProtocol(None)
  531. # Build a fake remote server to reply with
  532. server = AccumulatingProtocol()
  533. # Connect the two together
  534. server.makeConnection(FakeTransport(client, self.reactor))
  535. client.makeConnection(FakeTransport(server, self.reactor))
  536. # Tell Synapse that it has received some data from the remote server
  537. client.dataReceived(
  538. b"HTTP/1.0 200 OK\r\nContent-Length: %d\r\nContent-Type: text/html\r\n\r\n"
  539. % (len(self.end_content),)
  540. + self.end_content
  541. )
  542. # Move the reactor along until we get a response on our original channel
  543. self.pump()
  544. self.assertEqual(channel.code, 200)
  545. self.assertEqual(
  546. channel.json_body, {"og:title": "~matrix~", "og:description": "hi"}
  547. )
  548. # Check that the server received the Accept-Language header as part
  549. # of the request from Synapse
  550. self.assertIn(
  551. (
  552. b"Accept-Language: en-UK\r\n"
  553. b"Accept-Language: en-US;q=0.9\r\n"
  554. b"Accept-Language: fr;q=0.8\r\n"
  555. b"Accept-Language: *;q=0.7"
  556. ),
  557. server.data,
  558. )
  559. def test_data_url(self):
  560. """
  561. Requesting to preview a data URL is not supported.
  562. """
  563. self.lookups["matrix.org"] = [(IPv4Address, "10.1.2.3")]
  564. data = base64.b64encode(SMALL_PNG).decode()
  565. query_params = urlencode(
  566. {
  567. "url": f'<html><head><img src="data:image/png;base64,{data}" /></head></html>'
  568. }
  569. )
  570. channel = self.make_request(
  571. "GET",
  572. f"preview_url?{query_params}",
  573. shorthand=False,
  574. )
  575. self.pump()
  576. self.assertEqual(channel.code, 500)
  577. def test_inline_data_url(self):
  578. """
  579. An inline image (as a data URL) should be parsed properly.
  580. """
  581. self.lookups["matrix.org"] = [(IPv4Address, "10.1.2.3")]
  582. data = base64.b64encode(SMALL_PNG)
  583. end_content = (
  584. b"<html><head>" b'<img src="data:image/png;base64,%s" />' b"</head></html>"
  585. ) % (data,)
  586. channel = self.make_request(
  587. "GET",
  588. "preview_url?url=http://matrix.org",
  589. shorthand=False,
  590. await_result=False,
  591. )
  592. self.pump()
  593. client = self.reactor.tcpClients[0][2].buildProtocol(None)
  594. server = AccumulatingProtocol()
  595. server.makeConnection(FakeTransport(client, self.reactor))
  596. client.makeConnection(FakeTransport(server, self.reactor))
  597. client.dataReceived(
  598. (
  599. b"HTTP/1.0 200 OK\r\nContent-Length: %d\r\n"
  600. b'Content-Type: text/html; charset="utf8"\r\n\r\n'
  601. )
  602. % (len(end_content),)
  603. + end_content
  604. )
  605. self.pump()
  606. self.assertEqual(channel.code, 200)
  607. self._assert_small_png(channel.json_body)
  608. def test_oembed_photo(self):
  609. """Test an oEmbed endpoint which returns a 'photo' type which redirects the preview to a new URL."""
  610. self.lookups["publish.twitter.com"] = [(IPv4Address, "10.1.2.3")]
  611. self.lookups["cdn.twitter.com"] = [(IPv4Address, "10.1.2.3")]
  612. result = {
  613. "version": "1.0",
  614. "type": "photo",
  615. "url": "http://cdn.twitter.com/matrixdotorg",
  616. }
  617. oembed_content = json.dumps(result).encode("utf-8")
  618. channel = self.make_request(
  619. "GET",
  620. "preview_url?url=http://twitter.com/matrixdotorg/status/12345",
  621. shorthand=False,
  622. await_result=False,
  623. )
  624. self.pump()
  625. client = self.reactor.tcpClients[0][2].buildProtocol(None)
  626. server = AccumulatingProtocol()
  627. server.makeConnection(FakeTransport(client, self.reactor))
  628. client.makeConnection(FakeTransport(server, self.reactor))
  629. client.dataReceived(
  630. (
  631. b"HTTP/1.0 200 OK\r\nContent-Length: %d\r\n"
  632. b'Content-Type: application/json; charset="utf8"\r\n\r\n'
  633. )
  634. % (len(oembed_content),)
  635. + oembed_content
  636. )
  637. self.pump()
  638. # Ensure a second request is made to the photo URL.
  639. client = self.reactor.tcpClients[1][2].buildProtocol(None)
  640. server = AccumulatingProtocol()
  641. server.makeConnection(FakeTransport(client, self.reactor))
  642. client.makeConnection(FakeTransport(server, self.reactor))
  643. client.dataReceived(
  644. (
  645. b"HTTP/1.0 200 OK\r\nContent-Length: %d\r\n"
  646. b"Content-Type: image/png\r\n\r\n"
  647. )
  648. % (len(SMALL_PNG),)
  649. + SMALL_PNG
  650. )
  651. self.pump()
  652. # Ensure the URL is what was requested.
  653. self.assertIn(b"/matrixdotorg", server.data)
  654. self.assertEqual(channel.code, 200)
  655. body = channel.json_body
  656. self.assertEqual(body["og:url"], "http://twitter.com/matrixdotorg/status/12345")
  657. self._assert_small_png(body)
  658. def test_oembed_rich(self):
  659. """Test an oEmbed endpoint which returns HTML content via the 'rich' type."""
  660. self.lookups["publish.twitter.com"] = [(IPv4Address, "10.1.2.3")]
  661. result = {
  662. "version": "1.0",
  663. "type": "rich",
  664. # Note that this provides the author, not the title.
  665. "author_name": "Alice",
  666. "html": "<div>Content Preview</div>",
  667. }
  668. end_content = json.dumps(result).encode("utf-8")
  669. channel = self.make_request(
  670. "GET",
  671. "preview_url?url=http://twitter.com/matrixdotorg/status/12345",
  672. shorthand=False,
  673. await_result=False,
  674. )
  675. self.pump()
  676. client = self.reactor.tcpClients[0][2].buildProtocol(None)
  677. server = AccumulatingProtocol()
  678. server.makeConnection(FakeTransport(client, self.reactor))
  679. client.makeConnection(FakeTransport(server, self.reactor))
  680. client.dataReceived(
  681. (
  682. b"HTTP/1.0 200 OK\r\nContent-Length: %d\r\n"
  683. b'Content-Type: application/json; charset="utf8"\r\n\r\n'
  684. )
  685. % (len(end_content),)
  686. + end_content
  687. )
  688. self.pump()
  689. self.assertEqual(channel.code, 200)
  690. body = channel.json_body
  691. self.assertEqual(
  692. body,
  693. {
  694. "og:url": "http://twitter.com/matrixdotorg/status/12345",
  695. "og:title": "Alice",
  696. "og:description": "Content Preview",
  697. },
  698. )
  699. def test_oembed_format(self):
  700. """Test an oEmbed endpoint which requires the format in the URL."""
  701. self.lookups["www.hulu.com"] = [(IPv4Address, "10.1.2.3")]
  702. result = {
  703. "version": "1.0",
  704. "type": "rich",
  705. "html": "<div>Content Preview</div>",
  706. }
  707. end_content = json.dumps(result).encode("utf-8")
  708. channel = self.make_request(
  709. "GET",
  710. "preview_url?url=http://www.hulu.com/watch/12345",
  711. shorthand=False,
  712. await_result=False,
  713. )
  714. self.pump()
  715. client = self.reactor.tcpClients[0][2].buildProtocol(None)
  716. server = AccumulatingProtocol()
  717. server.makeConnection(FakeTransport(client, self.reactor))
  718. client.makeConnection(FakeTransport(server, self.reactor))
  719. client.dataReceived(
  720. (
  721. b"HTTP/1.0 200 OK\r\nContent-Length: %d\r\n"
  722. b'Content-Type: application/json; charset="utf8"\r\n\r\n'
  723. )
  724. % (len(end_content),)
  725. + end_content
  726. )
  727. self.pump()
  728. # The {format} should have been turned into json.
  729. self.assertIn(b"/api/oembed.json", server.data)
  730. # A URL parameter of format=json should be provided.
  731. self.assertIn(b"format=json", server.data)
  732. self.assertEqual(channel.code, 200)
  733. body = channel.json_body
  734. self.assertEqual(
  735. body,
  736. {
  737. "og:url": "http://www.hulu.com/watch/12345",
  738. "og:description": "Content Preview",
  739. },
  740. )
  741. def test_oembed_autodiscovery(self):
  742. """
  743. Autodiscovery works by finding the link in the HTML response and then requesting an oEmbed URL.
  744. 1. Request a preview of a URL which is not known to the oEmbed code.
  745. 2. It returns HTML including a link to an oEmbed preview.
  746. 3. The oEmbed preview is requested and returns a URL for an image.
  747. 4. The image is requested for thumbnailing.
  748. """
  749. # This is a little cheesy in that we use the www subdomain (which isn't the
  750. # list of oEmbed patterns) to get "raw" HTML response.
  751. self.lookups["www.twitter.com"] = [(IPv4Address, "10.1.2.3")]
  752. self.lookups["publish.twitter.com"] = [(IPv4Address, "10.1.2.3")]
  753. self.lookups["cdn.twitter.com"] = [(IPv4Address, "10.1.2.3")]
  754. result = b"""
  755. <link rel="alternate" type="application/json+oembed"
  756. href="http://publish.twitter.com/oembed?url=http%3A%2F%2Fcdn.twitter.com%2Fmatrixdotorg%2Fstatus%2F12345&format=json"
  757. title="matrixdotorg" />
  758. """
  759. channel = self.make_request(
  760. "GET",
  761. "preview_url?url=http://www.twitter.com/matrixdotorg/status/12345",
  762. shorthand=False,
  763. await_result=False,
  764. )
  765. self.pump()
  766. client = self.reactor.tcpClients[0][2].buildProtocol(None)
  767. server = AccumulatingProtocol()
  768. server.makeConnection(FakeTransport(client, self.reactor))
  769. client.makeConnection(FakeTransport(server, self.reactor))
  770. client.dataReceived(
  771. (
  772. b"HTTP/1.0 200 OK\r\nContent-Length: %d\r\n"
  773. b'Content-Type: text/html; charset="utf8"\r\n\r\n'
  774. )
  775. % (len(result),)
  776. + result
  777. )
  778. self.pump()
  779. # The oEmbed response.
  780. result2 = {
  781. "version": "1.0",
  782. "type": "photo",
  783. "url": "http://cdn.twitter.com/matrixdotorg",
  784. }
  785. oembed_content = json.dumps(result2).encode("utf-8")
  786. # Ensure a second request is made to the oEmbed URL.
  787. client = self.reactor.tcpClients[1][2].buildProtocol(None)
  788. server = AccumulatingProtocol()
  789. server.makeConnection(FakeTransport(client, self.reactor))
  790. client.makeConnection(FakeTransport(server, self.reactor))
  791. client.dataReceived(
  792. (
  793. b"HTTP/1.0 200 OK\r\nContent-Length: %d\r\n"
  794. b'Content-Type: application/json; charset="utf8"\r\n\r\n'
  795. )
  796. % (len(oembed_content),)
  797. + oembed_content
  798. )
  799. self.pump()
  800. # Ensure the URL is what was requested.
  801. self.assertIn(b"/oembed?", server.data)
  802. # Ensure a third request is made to the photo URL.
  803. client = self.reactor.tcpClients[2][2].buildProtocol(None)
  804. server = AccumulatingProtocol()
  805. server.makeConnection(FakeTransport(client, self.reactor))
  806. client.makeConnection(FakeTransport(server, self.reactor))
  807. client.dataReceived(
  808. (
  809. b"HTTP/1.0 200 OK\r\nContent-Length: %d\r\n"
  810. b"Content-Type: image/png\r\n\r\n"
  811. )
  812. % (len(SMALL_PNG),)
  813. + SMALL_PNG
  814. )
  815. self.pump()
  816. # Ensure the URL is what was requested.
  817. self.assertIn(b"/matrixdotorg", server.data)
  818. self.assertEqual(channel.code, 200)
  819. body = channel.json_body
  820. self.assertEqual(
  821. body["og:url"], "http://www.twitter.com/matrixdotorg/status/12345"
  822. )
  823. self._assert_small_png(body)
  824. def _download_image(self):
  825. """Downloads an image into the URL cache.
  826. Returns:
  827. A (host, media_id) tuple representing the MXC URI of the image.
  828. """
  829. self.lookups["cdn.twitter.com"] = [(IPv4Address, "10.1.2.3")]
  830. channel = self.make_request(
  831. "GET",
  832. "preview_url?url=http://cdn.twitter.com/matrixdotorg",
  833. shorthand=False,
  834. await_result=False,
  835. )
  836. self.pump()
  837. client = self.reactor.tcpClients[0][2].buildProtocol(None)
  838. server = AccumulatingProtocol()
  839. server.makeConnection(FakeTransport(client, self.reactor))
  840. client.makeConnection(FakeTransport(server, self.reactor))
  841. client.dataReceived(
  842. b"HTTP/1.0 200 OK\r\nContent-Length: %d\r\nContent-Type: image/png\r\n\r\n"
  843. % (len(SMALL_PNG),)
  844. + SMALL_PNG
  845. )
  846. self.pump()
  847. self.assertEqual(channel.code, 200)
  848. body = channel.json_body
  849. mxc_uri = body["og:image"]
  850. host, _port, media_id = parse_and_validate_mxc_uri(mxc_uri)
  851. self.assertIsNone(_port)
  852. return host, media_id
  853. def test_storage_providers_exclude_files(self):
  854. """Test that files are not stored in or fetched from storage providers."""
  855. host, media_id = self._download_image()
  856. rel_file_path = self.preview_url.filepaths.url_cache_filepath_rel(media_id)
  857. media_store_path = os.path.join(self.media_store_path, rel_file_path)
  858. storage_provider_path = os.path.join(self.storage_path, rel_file_path)
  859. # Check storage
  860. self.assertTrue(os.path.isfile(media_store_path))
  861. self.assertFalse(
  862. os.path.isfile(storage_provider_path),
  863. "URL cache file was unexpectedly stored in a storage provider",
  864. )
  865. # Check fetching
  866. channel = self.make_request(
  867. "GET",
  868. f"download/{host}/{media_id}",
  869. shorthand=False,
  870. await_result=False,
  871. )
  872. self.pump()
  873. self.assertEqual(channel.code, 200)
  874. # Move cached file into the storage provider
  875. os.makedirs(os.path.dirname(storage_provider_path), exist_ok=True)
  876. os.rename(media_store_path, storage_provider_path)
  877. channel = self.make_request(
  878. "GET",
  879. f"download/{host}/{media_id}",
  880. shorthand=False,
  881. await_result=False,
  882. )
  883. self.pump()
  884. self.assertEqual(
  885. channel.code,
  886. 404,
  887. "URL cache file was unexpectedly retrieved from a storage provider",
  888. )
  889. def test_storage_providers_exclude_thumbnails(self):
  890. """Test that thumbnails are not stored in or fetched from storage providers."""
  891. host, media_id = self._download_image()
  892. rel_thumbnail_path = (
  893. self.preview_url.filepaths.url_cache_thumbnail_directory_rel(media_id)
  894. )
  895. media_store_thumbnail_path = os.path.join(
  896. self.media_store_path, rel_thumbnail_path
  897. )
  898. storage_provider_thumbnail_path = os.path.join(
  899. self.storage_path, rel_thumbnail_path
  900. )
  901. # Check storage
  902. self.assertTrue(os.path.isdir(media_store_thumbnail_path))
  903. self.assertFalse(
  904. os.path.isdir(storage_provider_thumbnail_path),
  905. "URL cache thumbnails were unexpectedly stored in a storage provider",
  906. )
  907. # Check fetching
  908. channel = self.make_request(
  909. "GET",
  910. f"thumbnail/{host}/{media_id}?width=32&height=32&method=scale",
  911. shorthand=False,
  912. await_result=False,
  913. )
  914. self.pump()
  915. self.assertEqual(channel.code, 200)
  916. # Remove the original, otherwise thumbnails will regenerate
  917. rel_file_path = self.preview_url.filepaths.url_cache_filepath_rel(media_id)
  918. media_store_path = os.path.join(self.media_store_path, rel_file_path)
  919. os.remove(media_store_path)
  920. # Move cached thumbnails into the storage provider
  921. os.makedirs(os.path.dirname(storage_provider_thumbnail_path), exist_ok=True)
  922. os.rename(media_store_thumbnail_path, storage_provider_thumbnail_path)
  923. channel = self.make_request(
  924. "GET",
  925. f"thumbnail/{host}/{media_id}?width=32&height=32&method=scale",
  926. shorthand=False,
  927. await_result=False,
  928. )
  929. self.pump()
  930. self.assertEqual(
  931. channel.code,
  932. 404,
  933. "URL cache thumbnail was unexpectedly retrieved from a storage provider",
  934. )
  935. def test_cache_expiry(self):
  936. """Test that URL cache files and thumbnails are cleaned up properly on expiry."""
  937. self.preview_url.clock = MockClock()
  938. _host, media_id = self._download_image()
  939. file_path = self.preview_url.filepaths.url_cache_filepath(media_id)
  940. file_dirs = self.preview_url.filepaths.url_cache_filepath_dirs_to_delete(
  941. media_id
  942. )
  943. thumbnail_dir = self.preview_url.filepaths.url_cache_thumbnail_directory(
  944. media_id
  945. )
  946. thumbnail_dirs = self.preview_url.filepaths.url_cache_thumbnail_dirs_to_delete(
  947. media_id
  948. )
  949. self.assertTrue(os.path.isfile(file_path))
  950. self.assertTrue(os.path.isdir(thumbnail_dir))
  951. self.preview_url.clock.advance_time_msec(IMAGE_CACHE_EXPIRY_MS + 1)
  952. self.get_success(self.preview_url._expire_url_cache_data())
  953. for path in [file_path] + file_dirs + [thumbnail_dir] + thumbnail_dirs:
  954. self.assertFalse(
  955. os.path.exists(path),
  956. f"{os.path.relpath(path, self.media_store_path)} was not deleted",
  957. )