test_url_preview.py 48 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
7127812791280128112821283128412851286128712881289129012911292129312941295129612971298129913001301130213031304130513061307130813091310131113121313131413151316131713181319132013211322132313241325132613271328132913301331133213331334133513361337133813391340134113421343134413451346134713481349135013511352135313541355135613571358135913601361136213631364136513661367136813691370137113721373137413751376137713781379138013811382138313841385138613871388138913901391139213931394139513961397139813991400140114021403140414051406140714081409141014111412
  1. # Copyright 2018 New Vector Ltd
  2. # Copyright 2021 The Matrix.org Foundation C.I.C.
  3. #
  4. # Licensed under the Apache License, Version 2.0 (the "License");
  5. # you may not use this file except in compliance with the License.
  6. # You may obtain a copy of the License at
  7. #
  8. # http://www.apache.org/licenses/LICENSE-2.0
  9. #
  10. # Unless required by applicable law or agreed to in writing, software
  11. # distributed under the License is distributed on an "AS IS" BASIS,
  12. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. # See the License for the specific language governing permissions and
  14. # limitations under the License.
  15. import base64
  16. import json
  17. import os
  18. import re
  19. from typing import Any, Dict, Optional, Sequence, Tuple, Type
  20. from urllib.parse import quote, urlencode
  21. from twisted.internet._resolver import HostResolution
  22. from twisted.internet.address import IPv4Address, IPv6Address
  23. from twisted.internet.error import DNSLookupError
  24. from twisted.internet.interfaces import IAddress, IResolutionReceiver
  25. from twisted.test.proto_helpers import AccumulatingProtocol, MemoryReactor
  26. from synapse.config.oembed import OEmbedEndpointConfig
  27. from synapse.media.url_previewer import IMAGE_CACHE_EXPIRY_MS
  28. from synapse.rest.media.media_repository_resource import MediaRepositoryResource
  29. from synapse.server import HomeServer
  30. from synapse.types import JsonDict
  31. from synapse.util import Clock
  32. from synapse.util.stringutils import parse_and_validate_mxc_uri
  33. from tests import unittest
  34. from tests.server import FakeTransport
  35. from tests.test_utils import SMALL_PNG
  36. try:
  37. import lxml
  38. except ImportError:
  39. lxml = None # type: ignore[assignment]
  40. class URLPreviewTests(unittest.HomeserverTestCase):
  if not lxml:
      # The optional lxml import above failed, so flag the whole test case
      # as skipped with an explanatory message.
      skip = "url preview feature requires lxml"
  # NOTE(review): presumably read by HomeserverTestCase so that requests are
  # treated as authenticated as `user_id` — confirm against the base class.
  hijack_auth = True
  user_id = "@test:user"
  # Minimal HTML document carrying Open Graph title/description metadata.
  # Several tests in this class send it as the fake remote server's body.
  end_content = (
      b"<html><head>"
      b'<meta property="og:title" content="~matrix~" />'
      b'<meta property="og:description" content="hi" />'
      b"</head></html>"
  )
  def make_homeserver(self, reactor: MemoryReactor, clock: Clock) -> HomeServer:
      """Create the homeserver under test with URL previews switched on.

      The configuration enables URL previews, installs an IP blacklist and
      whitelist plus an Accept-Language list, and points the media store and
      a file storage provider at freshly created temporary directories.
      Once the homeserver exists, its parsed oEmbed patterns are replaced
      with two plain-HTTP endpoints.

      Returns:
          The configured test homeserver.
      """
      settings = self.default_config()
      settings["url_preview_enabled"] = True
      settings["max_spider_size"] = 9999999
      settings["url_preview_ip_range_blacklist"] = (
          "192.168.1.1",
          "1.0.0.0/8",
          "3fff:ffff:ffff:ffff:ffff:ffff:ffff:ffff",
          "2001:800::/21",
      )
      # 1.1.1.1 sits inside the blacklisted 1.0.0.0/8 range; whitelisting it
      # lets tests exercise the "blocked, then allowed again" path.
      settings["url_preview_ip_range_whitelist"] = ("1.1.1.1",)
      settings["url_preview_accept_language"] = [
          "en-UK",
          "en-US;q=0.9",
          "fr;q=0.8",
          "*;q=0.7",
      ]

      # Two scratch directories: one for the storage provider, one for the
      # media store itself.
      self.storage_path = self.mktemp()
      self.media_store_path = self.mktemp()
      for directory in (self.storage_path, self.media_store_path):
          os.mkdir(directory)

      settings["media_store_path"] = self.media_store_path
      settings["media_storage_providers"] = [
          {
              "module": "synapse.media.storage_provider.FileStorageProviderBackend",
              "store_local": True,
              "store_synchronous": False,
              "store_remote": True,
              "config": {"directory": self.storage_path},
          }
      ]

      homeserver = self.setup_test_homeserver(config=settings)

      # Patch the already-parsed oEmbed configuration instead of writing
      # provider files to disk. HTTP (not HTTPS) endpoints are used so the
      # tests never have to deal with TLS.
      twitter_endpoint = OEmbedEndpointConfig(
          api_endpoint="http://publish.twitter.com/oembed",
          url_patterns=[
              re.compile(r"http://twitter\.com/.+/status/.+"),
          ],
          formats=None,
      )
      hulu_endpoint = OEmbedEndpointConfig(
          api_endpoint="http://www.hulu.com/api/oembed.{format}",
          url_patterns=[
              re.compile(r"http://www\.hulu\.com/watch/.+"),
          ],
          formats=["json"],
      )
      homeserver.config.oembed.oembed_patterns = [twitter_endpoint, hulu_endpoint]

      return homeserver
  def prepare(self, reactor: MemoryReactor, clock: Clock, hs: HomeServer) -> None:
      """Grab the preview resource and install a fake DNS resolver.

      ``self.lookups`` maps a hostname to a list of
      ``(address class, IP string)`` pairs. The fake resolver reports those
      addresses for known hostnames and raises ``DNSLookupError`` for
      anything that is not in the mapping.
      """
      self.media_repo = hs.get_media_repository()
      repo_resource = hs.get_media_repository_resource()
      self.preview_url = repo_resource.children[b"preview_url"]

      self.lookups: Dict[str, Any] = {}

      class Resolver:
          # `_self` is the resolver instance; `self` (closed over) is the
          # enclosing test case, which owns the `lookups` table.
          def resolveHostName(
              _self,
              resolutionReceiver: IResolutionReceiver,
              hostName: str,
              portNumber: int = 0,
              addressTypes: Optional[Sequence[Type[IAddress]]] = None,
              transportSemantics: str = "TCP",
          ) -> IResolutionReceiver:
              resolutionReceiver.resolutionBegan(HostResolution(hostName))

              # Unknown hostnames fail after the resolution has begun.
              if hostName not in self.lookups:
                  raise DNSLookupError("OH NO")

              for entry in self.lookups[hostName]:
                  # entry[0] is the address class, entry[1] the IP string.
                  address = entry[0]("TCP", entry[1], portNumber)
                  resolutionReceiver.addressResolved(address)

              resolutionReceiver.resolutionComplete()
              return resolutionReceiver

      self.reactor.nameResolver = Resolver()  # type: ignore[assignment]
  def create_test_resource(self) -> MediaRepositoryResource:
      """Return the homeserver's media repository resource as the test root."""
      media_resource = self.hs.get_media_repository_resource()
      return media_resource
  def _assert_small_png(self, json_body: JsonDict) -> None:
      """Check that a preview response describes the SMALL_PNG test image.

      Args:
          json_body: The decoded JSON body of a preview response.
      """
      # The image must have been stored locally and exposed as an MXC URI.
      image_uri = json_body["og:image"]
      self.assertTrue(image_uri.startswith("mxc://"))

      # Remaining properties are compared one by one, in a fixed order.
      expected_properties = (
          ("og:image:height", 1),
          ("og:image:width", 1),
          ("og:image:type", "image/png"),
          ("matrix:image:size", 67),
      )
      for key, expected_value in expected_properties:
          self.assertEqual(json_body[key], expected_value)
  def test_cache_returns_correct_type(self) -> None:
      """Cached previews (in-memory, then database) match the first response."""
      self.lookups["matrix.org"] = [(IPv4Address, "10.1.2.3")]
      expected_preview = {"og:title": "~matrix~", "og:description": "hi"}

      channel = self.make_request(
          "GET",
          "preview_url?url=http://matrix.org",
          shorthand=False,
          await_result=False,
      )
      self.pump()

      # Wire Synapse's outbound client to a fake remote and feed it the page.
      factory = self.reactor.tcpClients[0][2]
      preview_client = factory.buildProtocol(None)
      fake_remote = AccumulatingProtocol()
      fake_remote.makeConnection(FakeTransport(preview_client, self.reactor))
      preview_client.makeConnection(FakeTransport(fake_remote, self.reactor))
      response = (
          b"HTTP/1.0 200 OK\r\nContent-Length: %d\r\nContent-Type: text/html\r\n\r\n"
          % (len(self.end_content),)
          + self.end_content
      )
      preview_client.dataReceived(response)

      self.pump()
      self.assertEqual(channel.code, 200)
      self.assertEqual(channel.json_body, expected_preview)

      # Second request: served without any further remote interaction.
      channel = self.make_request(
          "GET", "preview_url?url=http://matrix.org", shorthand=False
      )
      self.assertEqual(channel.code, 200)
      self.assertEqual(channel.json_body, expected_preview)

      # Evict the entry from the in-memory cache.
      url_cache = self.preview_url._url_previewer._cache
      self.assertIn("http://matrix.org", url_cache)
      url_cache.pop("http://matrix.org")
      self.assertNotIn("http://matrix.org", url_cache)

      # Third request: the database cache must return the same content.
      channel = self.make_request(
          "GET", "preview_url?url=http://matrix.org", shorthand=False
      )
      self.assertEqual(channel.code, 200)
      self.assertEqual(channel.json_body, expected_preview)
  def test_non_ascii_preview_httpequiv(self) -> None:
      """A windows-1251 charset from a meta http-equiv tag decodes the title.

      The HTTP Content-Type header claims utf8 while the document itself
      declares windows-1251; the title must come back as Cyrillic text.
      """
      self.lookups["matrix.org"] = [(IPv4Address, "10.1.2.3")]

      page = (
          b"<html><head>"
          b'<meta http-equiv="Content-Type" content="text/html; charset=windows-1251"/>'
          b'<meta property="og:title" content="\xe4\xea\xe0" />'
          b'<meta property="og:description" content="hi" />'
          b"</head></html>"
      )

      channel = self.make_request(
          "GET",
          "preview_url?url=http://matrix.org",
          shorthand=False,
          await_result=False,
      )
      self.pump()

      factory = self.reactor.tcpClients[0][2]
      preview_client = factory.buildProtocol(None)
      fake_remote = AccumulatingProtocol()
      fake_remote.makeConnection(FakeTransport(preview_client, self.reactor))
      preview_client.makeConnection(FakeTransport(fake_remote, self.reactor))
      header_template = (
          b"HTTP/1.0 200 OK\r\nContent-Length: %d\r\n"
          b'Content-Type: text/html; charset="utf8"\r\n\r\n'
      )
      preview_client.dataReceived(header_template % (len(page),) + page)

      self.pump()
      self.assertEqual(channel.code, 200)
      self.assertEqual(channel.json_body["og:title"], "\u0434\u043a\u0430")
  def test_video_rejected(self) -> None:
      """A remote response with a video content type yields a 502 error."""
      self.lookups["matrix.org"] = [(IPv4Address, "10.1.2.3")]

      payload = b"anything"

      channel = self.make_request(
          "GET",
          "preview_url?url=http://matrix.org",
          shorthand=False,
          await_result=False,
      )
      self.pump()

      factory = self.reactor.tcpClients[0][2]
      preview_client = factory.buildProtocol(None)
      fake_remote = AccumulatingProtocol()
      fake_remote.makeConnection(FakeTransport(preview_client, self.reactor))
      preview_client.makeConnection(FakeTransport(fake_remote, self.reactor))
      header_template = (
          b"HTTP/1.0 200 OK\r\nContent-Length: %d\r\n"
          b"Content-Type: video/mp4\r\n\r\n"
      )
      preview_client.dataReceived(header_template % (len(payload),) + payload)

      self.pump()
      self.assertEqual(channel.code, 502)
      expected_error = {
          "errcode": "M_UNKNOWN",
          "error": "Requested file's content type not allowed for this operation: video/mp4",
      }
      self.assertEqual(channel.json_body, expected_error)
  def test_audio_rejected(self) -> None:
      """A remote response with an audio content type yields a 502 error."""
      self.lookups["matrix.org"] = [(IPv4Address, "10.1.2.3")]

      payload = b"anything"

      channel = self.make_request(
          "GET",
          "preview_url?url=http://matrix.org",
          shorthand=False,
          await_result=False,
      )
      self.pump()

      factory = self.reactor.tcpClients[0][2]
      preview_client = factory.buildProtocol(None)
      fake_remote = AccumulatingProtocol()
      fake_remote.makeConnection(FakeTransport(preview_client, self.reactor))
      preview_client.makeConnection(FakeTransport(fake_remote, self.reactor))
      header_template = (
          b"HTTP/1.0 200 OK\r\nContent-Length: %d\r\n"
          b"Content-Type: audio/aac\r\n\r\n"
      )
      preview_client.dataReceived(header_template % (len(payload),) + payload)

      self.pump()
      self.assertEqual(channel.code, 502)
      expected_error = {
          "errcode": "M_UNKNOWN",
          "error": "Requested file's content type not allowed for this operation: audio/aac",
      }
      self.assertEqual(channel.json_body, expected_error)
  def test_non_ascii_preview_content_type(self) -> None:
      """A windows-1251 charset from the Content-Type header decodes the title."""
      self.lookups["matrix.org"] = [(IPv4Address, "10.1.2.3")]

      # No charset hint in the document itself; only the HTTP header has one.
      page = (
          b"<html><head>"
          b'<meta property="og:title" content="\xe4\xea\xe0" />'
          b'<meta property="og:description" content="hi" />'
          b"</head></html>"
      )

      channel = self.make_request(
          "GET",
          "preview_url?url=http://matrix.org",
          shorthand=False,
          await_result=False,
      )
      self.pump()

      factory = self.reactor.tcpClients[0][2]
      preview_client = factory.buildProtocol(None)
      fake_remote = AccumulatingProtocol()
      fake_remote.makeConnection(FakeTransport(preview_client, self.reactor))
      preview_client.makeConnection(FakeTransport(fake_remote, self.reactor))
      header_template = (
          b"HTTP/1.0 200 OK\r\nContent-Length: %d\r\n"
          b'Content-Type: text/html; charset="windows-1251"\r\n\r\n'
      )
      preview_client.dataReceived(header_template % (len(page),) + page)

      self.pump()
      self.assertEqual(channel.code, 200)
      self.assertEqual(channel.json_body["og:title"], "\u0434\u043a\u0430")
  def test_overlong_title(self) -> None:
      """A 2000-character <title> is left out of the preview entirely."""
      self.lookups["matrix.org"] = [(IPv4Address, "10.1.2.3")]

      long_title = b"x" * 2000
      page = (
          b"<html><head><title>"
          + long_title
          + b"</title>"
          + b'<meta property="og:description" content="hi" />'
          + b"</head></html>"
      )

      channel = self.make_request(
          "GET",
          "preview_url?url=http://matrix.org",
          shorthand=False,
          await_result=False,
      )
      self.pump()

      factory = self.reactor.tcpClients[0][2]
      preview_client = factory.buildProtocol(None)
      fake_remote = AccumulatingProtocol()
      fake_remote.makeConnection(FakeTransport(preview_client, self.reactor))
      preview_client.makeConnection(FakeTransport(fake_remote, self.reactor))
      header_template = (
          b"HTTP/1.0 200 OK\r\nContent-Length: %d\r\n"
          b'Content-Type: text/html; charset="windows-1251"\r\n\r\n'
      )
      preview_client.dataReceived(header_template % (len(page),) + page)

      self.pump()
      self.assertEqual(channel.code, 200)
      preview = channel.json_body
      # Only `og:description` may survive: the title is too long and must
      # have been stripped out.
      self.assertCountEqual(["og:description"], preview.keys())
  def test_ipaddr(self) -> None:
      """
      A hostname resolving to an IP outside the blacklist can be previewed.
      """
      self.lookups["example.com"] = [(IPv4Address, "10.1.2.3")]

      channel = self.make_request(
          "GET",
          "preview_url?url=http://example.com",
          shorthand=False,
          await_result=False,
      )
      self.pump()

      factory = self.reactor.tcpClients[0][2]
      preview_client = factory.buildProtocol(None)
      fake_remote = AccumulatingProtocol()
      fake_remote.makeConnection(FakeTransport(preview_client, self.reactor))
      preview_client.makeConnection(FakeTransport(fake_remote, self.reactor))
      response = (
          b"HTTP/1.0 200 OK\r\nContent-Length: %d\r\nContent-Type: text/html\r\n\r\n"
          % (len(self.end_content),)
          + self.end_content
      )
      preview_client.dataReceived(response)

      self.pump()
      self.assertEqual(channel.code, 200)
      expected_preview = {"og:title": "~matrix~", "og:description": "hi"}
      self.assertEqual(channel.json_body, expected_preview)
  def test_blocked_ip_specific(self) -> None:
      """
      A hostname that resolves to a single blacklisted IPv4 address is not
      spidered.
      """
      self.lookups["example.com"] = [(IPv4Address, "192.168.1.1")]

      channel = self.make_request(
          "GET", "preview_url?url=http://example.com", shorthand=False
      )

      # The outbound connection must never have been attempted.
      self.assertEqual(len(self.reactor.tcpClients), 0)
      self.assertEqual(channel.code, 502)
      expected_error = {
          "errcode": "M_UNKNOWN",
          "error": "DNS resolution failure during URL preview generation",
      }
      self.assertEqual(channel.json_body, expected_error)
  def test_blocked_ip_range(self) -> None:
      """
      A hostname that resolves into a blacklisted IPv4 range is not spidered.
      """
      self.lookups["example.com"] = [(IPv4Address, "1.1.1.2")]

      channel = self.make_request(
          "GET", "preview_url?url=http://example.com", shorthand=False
      )

      self.assertEqual(channel.code, 502)
      expected_error = {
          "errcode": "M_UNKNOWN",
          "error": "DNS resolution failure during URL preview generation",
      }
      self.assertEqual(channel.json_body, expected_error)
  def test_blocked_ip_specific_direct(self) -> None:
      """
      A URL whose host is a blacklisted IP literal is rejected with a 403.
      """
      channel = self.make_request(
          "GET", "preview_url?url=http://192.168.1.1", shorthand=False
      )

      # The outbound connection must never have been attempted.
      self.assertEqual(len(self.reactor.tcpClients), 0)
      expected_error = {"errcode": "M_UNKNOWN", "error": "IP address blocked"}
      self.assertEqual(channel.json_body, expected_error)
      self.assertEqual(channel.code, 403)
  def test_blocked_ip_range_direct(self) -> None:
      """
      A URL whose host is an IP literal inside a blacklisted range is
      rejected with a 403.
      """
      channel = self.make_request(
          "GET", "preview_url?url=http://1.1.1.2", shorthand=False
      )

      self.assertEqual(channel.code, 403)
      expected_error = {"errcode": "M_UNKNOWN", "error": "IP address blocked"}
      self.assertEqual(channel.json_body, expected_error)
  def test_blocked_ip_range_whitelisted_ip(self) -> None:
      """
      An address inside a blacklisted range that is also whitelisted can
      still be spidered.
      """
      self.lookups["example.com"] = [(IPv4Address, "1.1.1.1")]

      channel = self.make_request(
          "GET",
          "preview_url?url=http://example.com",
          shorthand=False,
          await_result=False,
      )
      self.pump()

      factory = self.reactor.tcpClients[0][2]
      preview_client = factory.buildProtocol(None)
      fake_remote = AccumulatingProtocol()
      fake_remote.makeConnection(FakeTransport(preview_client, self.reactor))
      preview_client.makeConnection(FakeTransport(fake_remote, self.reactor))
      response = (
          b"HTTP/1.0 200 OK\r\nContent-Length: %d\r\nContent-Type: text/html\r\n\r\n"
          % (len(self.end_content),)
          + self.end_content
      )
      preview_client.dataReceived(response)

      self.pump()
      self.assertEqual(channel.code, 200)
      expected_preview = {"og:title": "~matrix~", "og:description": "hi"}
      self.assertEqual(channel.json_body, expected_preview)
  def test_blocked_ip_with_external_ip(self) -> None:
      """
      A hostname is rejected when any of its resolved addresses is blocked,
      even if another one is not.
      """
      # Resolve the hostname to one blacklisted and one permitted address.
      resolved_addresses = [
          (IPv4Address, "1.1.1.2"),
          (IPv4Address, "10.1.2.3"),
      ]
      self.lookups["example.com"] = resolved_addresses

      channel = self.make_request(
          "GET", "preview_url?url=http://example.com", shorthand=False
      )

      self.assertEqual(channel.code, 502)
      expected_error = {
          "errcode": "M_UNKNOWN",
          "error": "DNS resolution failure during URL preview generation",
      }
      self.assertEqual(channel.json_body, expected_error)
  def test_blocked_ipv6_specific(self) -> None:
      """
      A hostname that resolves to a single blacklisted IPv6 address is not
      spidered.
      """
      blocked_address = (IPv6Address, "3fff:ffff:ffff:ffff:ffff:ffff:ffff:ffff")
      self.lookups["example.com"] = [blocked_address]

      channel = self.make_request(
          "GET", "preview_url?url=http://example.com", shorthand=False
      )

      # The outbound connection must never have been attempted.
      self.assertEqual(len(self.reactor.tcpClients), 0)
      self.assertEqual(channel.code, 502)
      expected_error = {
          "errcode": "M_UNKNOWN",
          "error": "DNS resolution failure during URL preview generation",
      }
      self.assertEqual(channel.json_body, expected_error)
  def test_blocked_ipv6_range(self) -> None:
      """
      A hostname that resolves into a blacklisted IPv6 range is not spidered.
      """
      self.lookups["example.com"] = [(IPv6Address, "2001:800::1")]

      channel = self.make_request(
          "GET", "preview_url?url=http://example.com", shorthand=False
      )

      self.assertEqual(channel.code, 502)
      expected_error = {
          "errcode": "M_UNKNOWN",
          "error": "DNS resolution failure during URL preview generation",
      }
      self.assertEqual(channel.json_body, expected_error)
  def test_OPTIONS(self) -> None:
      """
      An OPTIONS request to the preview endpoint returns 200 and an empty
      JSON object.
      """
      channel = self.make_request(
          "OPTIONS",
          "preview_url?url=http://example.com",
          shorthand=False,
      )

      self.assertEqual(channel.code, 200)
      self.assertEqual(channel.json_body, {})
  def test_accept_language_config_option(self) -> None:
      """
      The configured Accept-Language values are sent to the remote server.
      """
      self.lookups["example.com"] = [(IPv4Address, "10.1.2.3")]

      # Start the preview request without waiting for it to finish.
      channel = self.make_request(
          "GET",
          "preview_url?url=http://example.com",
          shorthand=False,
          await_result=False,
      )
      self.pump()

      # Synapse's outbound TCP client, and a fake remote that records
      # whatever is written to it.
      factory = self.reactor.tcpClients[0][2]
      preview_client = factory.buildProtocol(None)
      fake_remote = AccumulatingProtocol()

      # Join the two ends together.
      fake_remote.makeConnection(FakeTransport(preview_client, self.reactor))
      preview_client.makeConnection(FakeTransport(fake_remote, self.reactor))

      # Deliver the remote server's reply to Synapse.
      response = (
          b"HTTP/1.0 200 OK\r\nContent-Length: %d\r\nContent-Type: text/html\r\n\r\n"
          % (len(self.end_content),)
          + self.end_content
      )
      preview_client.dataReceived(response)

      # Run the reactor until the original request has a response.
      self.pump()
      self.assertEqual(channel.code, 200)
      expected_preview = {"og:title": "~matrix~", "og:description": "hi"}
      self.assertEqual(channel.json_body, expected_preview)

      # The request Synapse sent must contain one Accept-Language header
      # line per configured value, in configuration order.
      expected_headers = (
          b"Accept-Language: en-UK\r\n"
          b"Accept-Language: en-US;q=0.9\r\n"
          b"Accept-Language: fr;q=0.8\r\n"
          b"Accept-Language: *;q=0.7"
      )
      self.assertIn(expected_headers, fake_remote.data)
  def test_image(self) -> None:
      """An image referenced by the HTML is fetched and included in the preview."""
      self.lookups["matrix.org"] = [(IPv4Address, "10.1.2.3")]
      self.lookups["cdn.matrix.org"] = [(IPv4Address, "10.1.2.4")]

      page = (
          b"""<html><body><img src="http://cdn.matrix.org/foo.png"></body></html>"""
      )

      channel = self.make_request(
          "GET",
          "preview_url?url=http://matrix.org",
          shorthand=False,
          await_result=False,
      )
      self.pump()

      # First outbound connection: answer with the HTML page.
      page_client = self.reactor.tcpClients[0][2].buildProtocol(None)
      page_remote = AccumulatingProtocol()
      page_remote.makeConnection(FakeTransport(page_client, self.reactor))
      page_client.makeConnection(FakeTransport(page_remote, self.reactor))
      html_header = (
          b"HTTP/1.0 200 OK\r\nContent-Length: %d\r\n"
          b'Content-Type: text/html; charset="utf8"\r\n\r\n'
      )
      page_client.dataReceived(html_header % (len(page),) + page)
      self.pump()

      # Second outbound connection: answer with the PNG itself.
      image_client = self.reactor.tcpClients[1][2].buildProtocol(None)
      image_remote = AccumulatingProtocol()
      image_remote.makeConnection(FakeTransport(image_client, self.reactor))
      image_client.makeConnection(FakeTransport(image_remote, self.reactor))
      png_header = (
          b"HTTP/1.0 200 OK\r\nContent-Length: %d\r\n"
          b"Content-Type: image/png\r\n\r\n"
      )
      image_client.dataReceived(png_header % (len(SMALL_PNG),) + SMALL_PNG)
      self.pump()

      # The preview must describe the downloaded image.
      self.assertEqual(channel.code, 200)
      self._assert_small_png(channel.json_body)
  def test_nonexistent_image(self) -> None:
      """A preview is still returned when the image host cannot be resolved."""
      # Only matrix.org is resolvable; cdn.matrix.org is deliberately absent.
      self.lookups["matrix.org"] = [(IPv4Address, "10.1.2.3")]

      page = (
          b"""<html><body><img src="http://cdn.matrix.org/foo.jpg"></body></html>"""
      )

      channel = self.make_request(
          "GET",
          "preview_url?url=http://matrix.org",
          shorthand=False,
          await_result=False,
      )
      self.pump()

      factory = self.reactor.tcpClients[0][2]
      preview_client = factory.buildProtocol(None)
      fake_remote = AccumulatingProtocol()
      fake_remote.makeConnection(FakeTransport(preview_client, self.reactor))
      preview_client.makeConnection(FakeTransport(fake_remote, self.reactor))
      header_template = (
          b"HTTP/1.0 200 OK\r\nContent-Length: %d\r\n"
          b'Content-Type: text/html; charset="utf8"\r\n\r\n'
      )
      preview_client.dataReceived(header_template % (len(page),) + page)

      self.pump()

      # No connection to the image host may have been opened.
      self.assertEqual(len(self.reactor.tcpClients), 1)

      # The preview succeeds, just without an image.
      self.assertEqual(channel.code, 200)
      self.assertNotIn("og:image", channel.json_body)
  @unittest.override_config(
      {"url_preview_url_blacklist": [{"netloc": "cdn.matrix.org"}]}
  )
  def test_image_blocked(self) -> None:
      """If the preview image's host is on the URL blacklist, the image is
      not fetched but the rest of the preview is still returned."""
      # Unlike the "nonexistent image" case, the image host resolves fine;
      # it is the URL blacklist override above that must stop the fetch.
      self.lookups["matrix.org"] = [(IPv4Address, "10.1.2.3")]
      self.lookups["cdn.matrix.org"] = [(IPv4Address, "10.1.2.4")]

      result = (
          b"""<html><body><img src="http://cdn.matrix.org/foo.jpg"></body></html>"""
      )

      channel = self.make_request(
          "GET",
          "preview_url?url=http://matrix.org",
          shorthand=False,
          await_result=False,
      )
      self.pump()

      # Answer the page request with HTML that references the blocked image.
      client = self.reactor.tcpClients[0][2].buildProtocol(None)
      server = AccumulatingProtocol()
      server.makeConnection(FakeTransport(client, self.reactor))
      client.makeConnection(FakeTransport(server, self.reactor))
      client.dataReceived(
          (
              b"HTTP/1.0 200 OK\r\nContent-Length: %d\r\n"
              b'Content-Type: text/html; charset="utf8"\r\n\r\n'
          )
          % (len(result),)
          + result
      )

      self.pump()

      # There should not be a second connection (to the blocked image host).
      self.assertEqual(len(self.reactor.tcpClients), 1)

      # The preview succeeds, but the image should not be in the result.
      self.assertEqual(channel.code, 200)
      self.assertNotIn("og:image", channel.json_body)
  668. def test_oembed_failure(self) -> None:
  669. """If the autodiscovered oEmbed URL fails, ensure some data is returned."""
  670. self.lookups["matrix.org"] = [(IPv4Address, "10.1.2.3")]
  671. result = b"""
  672. <title>oEmbed Autodiscovery Fail</title>
  673. <link rel="alternate" type="application/json+oembed"
  674. href="http://example.com/oembed?url=http%3A%2F%2Fmatrix.org&format=json"
  675. title="matrixdotorg" />
  676. """
  677. channel = self.make_request(
  678. "GET",
  679. "preview_url?url=http://matrix.org",
  680. shorthand=False,
  681. await_result=False,
  682. )
  683. self.pump()
  684. client = self.reactor.tcpClients[0][2].buildProtocol(None)
  685. server = AccumulatingProtocol()
  686. server.makeConnection(FakeTransport(client, self.reactor))
  687. client.makeConnection(FakeTransport(server, self.reactor))
  688. client.dataReceived(
  689. (
  690. b"HTTP/1.0 200 OK\r\nContent-Length: %d\r\n"
  691. b'Content-Type: text/html; charset="utf8"\r\n\r\n'
  692. )
  693. % (len(result),)
  694. + result
  695. )
  696. self.pump()
  697. self.assertEqual(channel.code, 200)
  698. # The image should not be in the result.
  699. self.assertEqual(channel.json_body["og:title"], "oEmbed Autodiscovery Fail")
  def test_data_url(self) -> None:
      """
      A preview request whose `url` embeds a data: image is answered with 500.
      """
      self.lookups["matrix.org"] = [(IPv4Address, "10.1.2.3")]

      encoded_png = base64.b64encode(SMALL_PNG).decode()

      # The `url` parameter carries an HTML snippet with an inline data:
      # image rather than an http(s) URL.
      url_value = (
          f'<html><head><img src="data:image/png;base64,{encoded_png}" /></head></html>'
      )
      query_string = urlencode({"url": url_value})

      channel = self.make_request(
          "GET",
          f"preview_url?{query_string}",
          shorthand=False,
      )
      self.pump()

      self.assertEqual(channel.code, 500)
  def test_inline_data_url(self) -> None:
      """
      An image embedded in the page as a data: URL ends up in the preview.
      """
      self.lookups["matrix.org"] = [(IPv4Address, "10.1.2.3")]

      encoded_png = base64.b64encode(SMALL_PNG)

      page_template = (
          b"<html><head>" b'<img src="data:image/png;base64,%s" />' b"</head></html>"
      )
      page = page_template % (encoded_png,)

      channel = self.make_request(
          "GET",
          "preview_url?url=http://matrix.org",
          shorthand=False,
          await_result=False,
      )
      self.pump()

      factory = self.reactor.tcpClients[0][2]
      preview_client = factory.buildProtocol(None)
      fake_remote = AccumulatingProtocol()
      fake_remote.makeConnection(FakeTransport(preview_client, self.reactor))
      preview_client.makeConnection(FakeTransport(fake_remote, self.reactor))
      header_template = (
          b"HTTP/1.0 200 OK\r\nContent-Length: %d\r\n"
          b'Content-Type: text/html; charset="utf8"\r\n\r\n'
      )
      preview_client.dataReceived(header_template % (len(page),) + page)

      self.pump()
      self.assertEqual(channel.code, 200)
      self._assert_small_png(channel.json_body)
  def test_oembed_photo(self) -> None:
      """Test an oEmbed endpoint which returns a 'photo' type which redirects the preview to a new URL."""
      for hostname in ("publish.twitter.com", "cdn.twitter.com"):
          self.lookups[hostname] = [(IPv4Address, "10.1.2.3")]

      oembed_response = {
          "version": "1.0",
          "type": "photo",
          "url": "http://cdn.twitter.com/matrixdotorg",
      }
      oembed_content = json.dumps(oembed_response).encode("utf-8")

      channel = self.make_request(
          "GET",
          "preview_url?url=http://twitter.com/matrixdotorg/status/12345",
          shorthand=False,
          await_result=False,
      )
      self.pump()

      # First outbound connection: answer with the oEmbed JSON document.
      oembed_client = self.reactor.tcpClients[0][2].buildProtocol(None)
      oembed_server = AccumulatingProtocol()
      oembed_server.makeConnection(FakeTransport(oembed_client, self.reactor))
      oembed_client.makeConnection(FakeTransport(oembed_server, self.reactor))
      oembed_head = (
          b"HTTP/1.0 200 OK\r\nContent-Length: %d\r\n"
          b'Content-Type: application/json; charset="utf8"\r\n\r\n'
      ) % (len(oembed_content),)
      oembed_client.dataReceived(oembed_head + oembed_content)
      self.pump()

      # Ensure a second request is made to the photo URL.
      image_client = self.reactor.tcpClients[1][2].buildProtocol(None)
      image_server = AccumulatingProtocol()
      image_server.makeConnection(FakeTransport(image_client, self.reactor))
      image_client.makeConnection(FakeTransport(image_server, self.reactor))
      image_head = (
          b"HTTP/1.0 200 OK\r\nContent-Length: %d\r\n"
          b"Content-Type: image/png\r\n\r\n"
      ) % (len(SMALL_PNG),)
      image_client.dataReceived(image_head + SMALL_PNG)
      self.pump()

      # Ensure the URL is what was requested.
      self.assertIn(b"/matrixdotorg", image_server.data)

      self.assertEqual(channel.code, 200)
      preview = channel.json_body
      self.assertEqual(
          preview["og:url"], "http://twitter.com/matrixdotorg/status/12345"
      )
      self._assert_small_png(preview)
  def test_oembed_rich(self) -> None:
      """Test an oEmbed endpoint which returns HTML content via the 'rich' type."""
      self.lookups["publish.twitter.com"] = [(IPv4Address, "10.1.2.3")]

      oembed_response = {
          "version": "1.0",
          "type": "rich",
          # Note that this provides the author, not the title.
          "author_name": "Alice",
          "html": "<div>Content Preview</div>",
      }
      oembed_content = json.dumps(oembed_response).encode("utf-8")

      channel = self.make_request(
          "GET",
          "preview_url?url=http://twitter.com/matrixdotorg/status/12345",
          shorthand=False,
          await_result=False,
      )
      self.pump()

      # Connect the outbound request to a fake server and serve the JSON.
      remote_client = self.reactor.tcpClients[0][2].buildProtocol(None)
      remote_server = AccumulatingProtocol()
      remote_server.makeConnection(FakeTransport(remote_client, self.reactor))
      remote_client.makeConnection(FakeTransport(remote_server, self.reactor))
      response_head = (
          b"HTTP/1.0 200 OK\r\nContent-Length: %d\r\n"
          b'Content-Type: application/json; charset="utf8"\r\n\r\n'
      ) % (len(oembed_content),)
      remote_client.dataReceived(response_head + oembed_content)
      self.pump()

      # Double check that the proper host is being connected to. (Note that
      # twitter.com can't be resolved so this is already implicitly checked.)
      self.assertIn(b"\r\nHost: publish.twitter.com\r\n", remote_server.data)

      self.assertEqual(channel.code, 200)
      expected_preview = {
          "og:url": "http://twitter.com/matrixdotorg/status/12345",
          "og:title": "Alice",
          "og:description": "Content Preview",
      }
      self.assertEqual(channel.json_body, expected_preview)
  def test_oembed_format(self) -> None:
      """Test an oEmbed endpoint which requires the format in the URL."""
      self.lookups["www.hulu.com"] = [(IPv4Address, "10.1.2.3")]

      oembed_response = {
          "version": "1.0",
          "type": "rich",
          "html": "<div>Content Preview</div>",
      }
      oembed_content = json.dumps(oembed_response).encode("utf-8")

      channel = self.make_request(
          "GET",
          "preview_url?url=http://www.hulu.com/watch/12345",
          shorthand=False,
          await_result=False,
      )
      self.pump()

      # Connect the outbound request to a fake server and serve the JSON.
      remote_client = self.reactor.tcpClients[0][2].buildProtocol(None)
      remote_server = AccumulatingProtocol()
      remote_server.makeConnection(FakeTransport(remote_client, self.reactor))
      remote_client.makeConnection(FakeTransport(remote_server, self.reactor))
      response_head = (
          b"HTTP/1.0 200 OK\r\nContent-Length: %d\r\n"
          b'Content-Type: application/json; charset="utf8"\r\n\r\n'
      ) % (len(oembed_content),)
      remote_client.dataReceived(response_head + oembed_content)
      self.pump()

      request_bytes = remote_server.data
      # The {format} should have been turned into json.
      self.assertIn(b"/api/oembed.json", request_bytes)
      # A URL parameter of format=json should be provided.
      self.assertIn(b"format=json", request_bytes)

      self.assertEqual(channel.code, 200)
      expected_preview = {
          "og:url": "http://www.hulu.com/watch/12345",
          "og:description": "Content Preview",
      }
      self.assertEqual(channel.json_body, expected_preview)
  @unittest.override_config(
      {"url_preview_url_blacklist": [{"netloc": "publish.twitter.com"}]}
  )
  def test_oembed_blocked(self) -> None:
      """The oEmbed URL should not be downloaded if the oEmbed URL is blocked."""
      self.lookups["twitter.com"] = [(IPv4Address, "10.1.2.3")]

      preview_path = "preview_url?url=http://twitter.com/matrixdotorg/status/12345"
      channel = self.make_request(
          "GET",
          preview_path,
          shorthand=False,
          await_result=False,
      )
      self.pump()

      # The request is refused outright; include the raw result on failure.
      self.assertEqual(channel.code, 403, channel.result)
  def test_oembed_autodiscovery(self) -> None:
      """
      Autodiscovery works by finding the link in the HTML response and then requesting an oEmbed URL.

      1. Request a preview of a URL which is not known to the oEmbed code.
      2. It returns HTML including a link to an oEmbed preview.
      3. The oEmbed preview is requested and returns a URL for an image.
      4. The image is requested for thumbnailing.
      """
      # This is a little cheesy in that we use the www subdomain (which isn't in
      # the list of oEmbed patterns) to get a "raw" HTML response.
      for hostname in ("www.twitter.com", "publish.twitter.com", "cdn.twitter.com"):
          self.lookups[hostname] = [(IPv4Address, "10.1.2.3")]

      html_page = b"""
      <link rel="alternate" type="application/json+oembed"
          href="http://publish.twitter.com/oembed?url=http%3A%2F%2Fcdn.twitter.com%2Fmatrixdotorg%2Fstatus%2F12345&format=json"
          title="matrixdotorg" />
      """

      channel = self.make_request(
          "GET",
          "preview_url?url=http://www.twitter.com/matrixdotorg/status/12345",
          shorthand=False,
          await_result=False,
      )
      self.pump()

      # First connection: serve the HTML page carrying the discovery link.
      page_client = self.reactor.tcpClients[0][2].buildProtocol(None)
      page_server = AccumulatingProtocol()
      page_server.makeConnection(FakeTransport(page_client, self.reactor))
      page_client.makeConnection(FakeTransport(page_server, self.reactor))
      page_head = (
          b"HTTP/1.0 200 OK\r\nContent-Length: %d\r\n"
          b'Content-Type: text/html; charset="utf8"\r\n\r\n'
      ) % (len(html_page),)
      page_client.dataReceived(page_head + html_page)
      self.pump()

      # The oEmbed response.
      oembed_response = {
          "version": "1.0",
          "type": "photo",
          "url": "http://cdn.twitter.com/matrixdotorg",
      }
      oembed_content = json.dumps(oembed_response).encode("utf-8")

      # Ensure a second request is made to the oEmbed URL.
      oembed_client = self.reactor.tcpClients[1][2].buildProtocol(None)
      oembed_server = AccumulatingProtocol()
      oembed_server.makeConnection(FakeTransport(oembed_client, self.reactor))
      oembed_client.makeConnection(FakeTransport(oembed_server, self.reactor))
      oembed_head = (
          b"HTTP/1.0 200 OK\r\nContent-Length: %d\r\n"
          b'Content-Type: application/json; charset="utf8"\r\n\r\n'
      ) % (len(oembed_content),)
      oembed_client.dataReceived(oembed_head + oembed_content)
      self.pump()

      # Ensure the URL is what was requested.
      self.assertIn(b"/oembed?", oembed_server.data)

      # Ensure a third request is made to the photo URL.
      image_client = self.reactor.tcpClients[2][2].buildProtocol(None)
      image_server = AccumulatingProtocol()
      image_server.makeConnection(FakeTransport(image_client, self.reactor))
      image_client.makeConnection(FakeTransport(image_server, self.reactor))
      image_head = (
          b"HTTP/1.0 200 OK\r\nContent-Length: %d\r\n"
          b"Content-Type: image/png\r\n\r\n"
      ) % (len(SMALL_PNG),)
      image_client.dataReceived(image_head + SMALL_PNG)
      self.pump()

      # Ensure the URL is what was requested.
      self.assertIn(b"/matrixdotorg", image_server.data)

      self.assertEqual(channel.code, 200)
      preview = channel.json_body
      self.assertEqual(
          preview["og:url"], "http://www.twitter.com/matrixdotorg/status/12345"
      )
      self._assert_small_png(preview)
  @unittest.override_config(
      {"url_preview_url_blacklist": [{"netloc": "publish.twitter.com"}]}
  )
  def test_oembed_autodiscovery_blocked(self) -> None:
      """
      If the discovered oEmbed URL is blocked, it should be discarded.

      The preview then only carries what the HTML page itself provides (its
      title) and no image.
      """
      # This is a little cheesy in that we use the www subdomain (which isn't in
      # the list of oEmbed patterns) to get a "raw" HTML response.
      self.lookups["www.twitter.com"] = [(IPv4Address, "10.1.2.3")]
      self.lookups["publish.twitter.com"] = [(IPv4Address, "10.1.2.4")]

      html_page = b"""
      <title>Test</title>
      <link rel="alternate" type="application/json+oembed"
          href="http://publish.twitter.com/oembed?url=http%3A%2F%2Fcdn.twitter.com%2Fmatrixdotorg%2Fstatus%2F12345&format=json"
          title="matrixdotorg" />
      """

      channel = self.make_request(
          "GET",
          "preview_url?url=http://www.twitter.com/matrixdotorg/status/12345",
          shorthand=False,
          await_result=False,
      )
      self.pump()

      # Serve the HTML page over the only expected outbound connection.
      page_client = self.reactor.tcpClients[0][2].buildProtocol(None)
      page_server = AccumulatingProtocol()
      page_server.makeConnection(FakeTransport(page_client, self.reactor))
      page_client.makeConnection(FakeTransport(page_server, self.reactor))
      page_head = (
          b"HTTP/1.0 200 OK\r\nContent-Length: %d\r\n"
          b'Content-Type: text/html; charset="utf8"\r\n\r\n'
      ) % (len(html_page),)
      page_client.dataReceived(page_head + html_page)
      self.pump()

      # Ensure there's no additional connections.
      self.assertEqual(len(self.reactor.tcpClients), 1)

      # Ensure the URL is what was requested.
      self.assertIn(b"\r\nHost: www.twitter.com\r\n", page_server.data)

      self.assertEqual(channel.code, 200)
      preview = channel.json_body
      self.assertEqual(preview["og:title"], "Test")
      self.assertNotIn("og:image", preview)
  def _download_image(self) -> Tuple[str, str]:
      """Downloads an image into the URL cache.

      Requests a preview of an image URL, serves `SMALL_PNG` as the remote
      response and reads the resulting MXC URI from the preview's `og:image`.

      Returns:
          A (host, media_id) tuple representing the MXC URI of the image.
      """
      self.lookups["cdn.twitter.com"] = [(IPv4Address, "10.1.2.3")]

      channel = self.make_request(
          "GET",
          "preview_url?url=http://cdn.twitter.com/matrixdotorg",
          shorthand=False,
          await_result=False,
      )
      self.pump()

      # Connect the outbound request to a fake server and serve the PNG.
      remote_client = self.reactor.tcpClients[0][2].buildProtocol(None)
      remote_server = AccumulatingProtocol()
      remote_server.makeConnection(FakeTransport(remote_client, self.reactor))
      remote_client.makeConnection(FakeTransport(remote_server, self.reactor))
      response_head = (
          b"HTTP/1.0 200 OK\r\nContent-Length: %d\r\nContent-Type: image/png\r\n\r\n"
          % (len(SMALL_PNG),)
      )
      remote_client.dataReceived(response_head + SMALL_PNG)
      self.pump()

      self.assertEqual(channel.code, 200)

      image_uri = channel.json_body["og:image"]
      server_name, port, media_id = parse_and_validate_mxc_uri(image_uri)
      # The parsed URI is expected to carry no port component.
      self.assertIsNone(port)
      return server_name, media_id
  def test_storage_providers_exclude_files(self) -> None:
      """Test that files are not stored in or fetched from storage providers."""
      host, media_id = self._download_image()

      relative_path = self.media_repo.filepaths.url_cache_filepath_rel(media_id)
      local_path = os.path.join(self.media_store_path, relative_path)
      provider_path = os.path.join(self.storage_path, relative_path)
      download_path = f"download/{host}/{media_id}"

      # Check storage
      self.assertTrue(os.path.isfile(local_path))
      self.assertFalse(
          os.path.isfile(provider_path),
          "URL cache file was unexpectedly stored in a storage provider",
      )

      # Check fetching
      channel = self.make_request(
          "GET",
          download_path,
          shorthand=False,
          await_result=False,
      )
      self.pump()
      self.assertEqual(channel.code, 200)

      # Move cached file into the storage provider
      os.makedirs(os.path.dirname(provider_path), exist_ok=True)
      os.rename(local_path, provider_path)

      # With the file only present in the storage provider, the download
      # is expected to fail.
      channel = self.make_request(
          "GET",
          download_path,
          shorthand=False,
          await_result=False,
      )
      self.pump()
      self.assertEqual(
          channel.code,
          404,
          "URL cache file was unexpectedly retrieved from a storage provider",
      )
  def test_storage_providers_exclude_thumbnails(self) -> None:
      """Test that thumbnails are not stored in or fetched from storage providers."""
      host, media_id = self._download_image()

      filepaths = self.media_repo.filepaths
      relative_thumbnail_dir = filepaths.url_cache_thumbnail_directory_rel(media_id)
      local_thumbnail_dir = os.path.join(
          self.media_store_path, relative_thumbnail_dir
      )
      provider_thumbnail_dir = os.path.join(self.storage_path, relative_thumbnail_dir)
      thumbnail_path = f"thumbnail/{host}/{media_id}?width=32&height=32&method=scale"

      # Check storage
      self.assertTrue(os.path.isdir(local_thumbnail_dir))
      self.assertFalse(
          os.path.isdir(provider_thumbnail_dir),
          "URL cache thumbnails were unexpectedly stored in a storage provider",
      )

      # Check fetching
      channel = self.make_request(
          "GET",
          thumbnail_path,
          shorthand=False,
          await_result=False,
      )
      self.pump()
      self.assertEqual(channel.code, 200)

      # Remove the original, otherwise thumbnails will regenerate
      relative_file_path = filepaths.url_cache_filepath_rel(media_id)
      os.remove(os.path.join(self.media_store_path, relative_file_path))

      # Move cached thumbnails into the storage provider
      os.makedirs(os.path.dirname(provider_thumbnail_dir), exist_ok=True)
      os.rename(local_thumbnail_dir, provider_thumbnail_dir)

      # With the thumbnails only present in the storage provider, the
      # request is expected to fail.
      channel = self.make_request(
          "GET",
          thumbnail_path,
          shorthand=False,
          await_result=False,
      )
      self.pump()
      self.assertEqual(
          channel.code,
          404,
          "URL cache thumbnail was unexpectedly retrieved from a storage provider",
      )
  def test_cache_expiry(self) -> None:
      """Test that URL cache files and thumbnails are cleaned up properly on expiry."""
      _host, media_id = self._download_image()

      filepaths = self.media_repo.filepaths
      file_path = filepaths.url_cache_filepath(media_id)
      file_dirs = filepaths.url_cache_filepath_dirs_to_delete(media_id)
      thumbnail_dir = filepaths.url_cache_thumbnail_directory(media_id)
      thumbnail_dirs = filepaths.url_cache_thumbnail_dirs_to_delete(media_id)

      # Sanity check: the download must have produced both artefacts.
      self.assertTrue(os.path.isfile(file_path))
      self.assertTrue(os.path.isdir(thumbnail_dir))

      # The expiry constant is in milliseconds while the reactor clock is
      # advanced in seconds (Twisted's `Clock.advance`), so convert by
      # dividing. Multiplying, as before, overshot the expiry by a factor of
      # a million. The extra second moves us just past the expiry boundary.
      self.reactor.advance(IMAGE_CACHE_EXPIRY_MS / 1000 + 1)
      self.get_success(self.preview_url._url_previewer._expire_url_cache_data())

      # The file, the thumbnail directory and every parent directory listed
      # for deletion must be gone.
      for path in [file_path] + file_dirs + [thumbnail_dir] + thumbnail_dirs:
          self.assertFalse(
              os.path.exists(path),
              f"{os.path.relpath(path, self.media_store_path)} was not deleted",
          )
  @unittest.override_config({"url_preview_url_blacklist": [{"port": "*"}]})
  def test_blocked_port(self) -> None:
      """Tests that blocking URLs with a port makes previewing such URLs
      fail with a 403 error and doesn't impact other previews.
      """
      self.lookups["matrix.org"] = [(IPv4Address, "10.1.2.3")]

      url_with_port = quote("http://matrix.org:8888/foo")
      url_without_port = quote("http://matrix.org/foo")

      # The URL with an explicit port is refused.
      channel = self.make_request(
          "GET",
          "preview_url?url=" + url_with_port,
          shorthand=False,
          await_result=False,
      )
      self.pump()
      self.assertEqual(channel.code, 403, channel.result)

      # The URL without a port is still fetched and previewed.
      channel = self.make_request(
          "GET",
          "preview_url?url=" + url_without_port,
          shorthand=False,
          await_result=False,
      )
      self.pump()

      remote_client = self.reactor.tcpClients[0][2].buildProtocol(None)
      remote_server = AccumulatingProtocol()
      remote_server.makeConnection(FakeTransport(remote_client, self.reactor))
      remote_client.makeConnection(FakeTransport(remote_server, self.reactor))
      response_head = (
          b"HTTP/1.0 200 OK\r\nContent-Length: %d\r\nContent-Type: text/html\r\n\r\n"
          % (len(self.end_content),)
      )
      remote_client.dataReceived(response_head + self.end_content)
      self.pump()

      self.assertEqual(channel.code, 200)
  @unittest.override_config(
      {"url_preview_url_blacklist": [{"netloc": "example.com"}]}
  )
  def test_blocked_url(self) -> None:
      """Tests that blocking URLs with a host makes previewing such URLs
      fail with a 403 error.
      """
      self.lookups["example.com"] = [(IPv4Address, "10.1.2.3")]

      preview_path = "preview_url?url=" + quote("http://example.com/foo")
      channel = self.make_request(
          "GET",
          preview_path,
          shorthand=False,
          await_result=False,
      )
      self.pump()

      # Include the raw result in the failure message to ease debugging.
      self.assertEqual(channel.code, 403, channel.result)