test_url_preview.py 33 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983
  1. # Copyright 2018 New Vector Ltd
  2. # Copyright 2021 The Matrix.org Foundation C.I.C.
  3. #
  4. # Licensed under the Apache License, Version 2.0 (the "License");
  5. # you may not use this file except in compliance with the License.
  6. # You may obtain a copy of the License at
  7. #
  8. # http://www.apache.org/licenses/LICENSE-2.0
  9. #
  10. # Unless required by applicable law or agreed to in writing, software
  11. # distributed under the License is distributed on an "AS IS" BASIS,
  12. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. # See the License for the specific language governing permissions and
  14. # limitations under the License.
  15. import json
  16. import os
  17. import re
  18. from twisted.internet._resolver import HostResolution
  19. from twisted.internet.address import IPv4Address, IPv6Address
  20. from twisted.internet.error import DNSLookupError
  21. from twisted.test.proto_helpers import AccumulatingProtocol
  22. from synapse.config.oembed import OEmbedEndpointConfig
  23. from synapse.rest.media.v1.preview_url_resource import IMAGE_CACHE_EXPIRY_MS
  24. from synapse.util.stringutils import parse_and_validate_mxc_uri
  25. from tests import unittest
  26. from tests.server import FakeTransport
  27. from tests.test_utils import SMALL_PNG
  28. from tests.utils import MockClock
  29. try:
  30. import lxml
  31. except ImportError:
  32. lxml = None
  33. class URLPreviewTests(unittest.HomeserverTestCase):
  # `skip` is only defined when the optional lxml import failed; the string
  # is the reason reported for skipping the whole test case.
  if not lxml:
      skip = "url preview feature requires lxml"

  # NOTE(review): presumably read by HomeserverTestCase to stub out request
  # authentication as this user -- confirm against the base class.
  hijack_auth = True
  user_id = "@test:user"

  # Minimal HTML document with Open Graph title/description tags, sent back
  # by the fake remote server in the tests below.
  end_content = b"".join(
      (
          b"<html><head>",
          b'<meta property="og:title" content="~matrix~" />',
          b'<meta property="og:description" content="hi" />',
          b"</head></html>",
      )
  )
  def make_homeserver(self, reactor, clock):
      """Create the homeserver under test with URL previews switched on.

      The configuration enables previews, installs an IP blacklist with a
      single whitelisted address, sets the Accept-Language values, and points
      the media store and a file storage provider at two freshly created
      temporary directories. Once the homeserver exists, two oEmbed endpoint
      patterns are installed directly on its parsed config.

      Args:
          reactor: Not used by this method.
          clock: Not used by this method.

      Returns:
          The homeserver returned by ``setup_test_homeserver``.
      """
      hs_config = self.default_config()
      hs_config["url_preview_enabled"] = True
      hs_config["max_spider_size"] = 9999999

      blocked_ranges = (
          "192.168.1.1",
          "1.0.0.0/8",
          "3fff:ffff:ffff:ffff:ffff:ffff:ffff:ffff",
          "2001:800::/21",
      )
      hs_config["url_preview_ip_range_blacklist"] = blocked_ranges
      # 1.1.1.1 sits inside the blacklisted 1.0.0.0/8 range but is allowed.
      hs_config["url_preview_ip_range_whitelist"] = ("1.1.1.1",)
      hs_config["url_preview_url_blacklist"] = []
      hs_config["url_preview_accept_language"] = [
          "en-UK",
          "en-US;q=0.9",
          "fr;q=0.8",
          "*;q=0.7",
      ]

      # Separate directories for the storage provider and the media store.
      self.storage_path = self.mktemp()
      self.media_store_path = self.mktemp()
      for directory in (self.storage_path, self.media_store_path):
          os.mkdir(directory)
      hs_config["media_store_path"] = self.media_store_path

      hs_config["media_storage_providers"] = [
          {
              "module": "synapse.rest.media.v1.storage_provider.FileStorageProviderBackend",
              "store_local": True,
              "store_synchronous": False,
              "store_remote": True,
              "config": {"directory": self.storage_path},
          }
      ]

      homeserver = self.setup_test_homeserver(config=hs_config)

      # The oEmbed patterns are patched onto the already-parsed config so no
      # files need to be touched. Plain HTTP URLs keep TLS out of the tests.
      twitter_endpoint = OEmbedEndpointConfig(
          api_endpoint="http://publish.twitter.com/oembed",
          url_patterns=[
              re.compile(r"http://twitter\.com/.+/status/.+"),
          ],
          formats=None,
      )
      hulu_endpoint = OEmbedEndpointConfig(
          api_endpoint="http://www.hulu.com/api/oembed.{format}",
          url_patterns=[
              re.compile(r"http://www\.hulu\.com/watch/.+"),
          ],
          formats=["json"],
      )
      homeserver.config.oembed.oembed_patterns = [twitter_endpoint, hulu_endpoint]

      return homeserver
  def prepare(self, reactor, clock, hs):
      """Grab the preview resource and install a fake hostname resolver.

      Stores the media repository resource and its ``preview_url`` child on
      the test case, resets ``self.lookups`` to an empty mapping, and replaces
      ``self.reactor.nameResolver`` with a resolver that answers only from
      ``self.lookups``.

      Args:
          reactor: Not used by this method (``self.reactor`` is patched).
          clock: Not used by this method.
          hs: The homeserver whose media repository resource is inspected.
      """
      media_repo = hs.get_media_repository_resource()
      self.media_repo = media_repo
      self.preview_url = media_repo.children[b"preview_url"]

      # hostname -> list of (address class, IP string) entries.
      self.lookups = {}

      test_case = self

      class Resolver:
          def resolveHostName(
              self,
              resolutionReceiver,
              hostName,
              portNumber=0,
              addressTypes=None,
              transportSemantics="TCP",
          ):
              """Report the addresses registered for ``hostName`` in ``lookups``.

              Raises:
                  DNSLookupError: If ``hostName`` has no entry in ``lookups``.
              """
              resolutionReceiver.resolutionBegan(HostResolution(hostName))

              # The lookup table is read through the test case at call time,
              # so entries added by a test after `prepare` are honoured.
              if hostName not in test_case.lookups:
                  raise DNSLookupError("OH NO")

              for entry in test_case.lookups[hostName]:
                  address = entry[0]("TCP", entry[1], portNumber)
                  resolutionReceiver.addressResolved(address)
              resolutionReceiver.resolutionComplete()
              return resolutionReceiver

      self.reactor.nameResolver = Resolver()
  def create_test_resource(self):
      """Return the homeserver's media repository resource as the root under test."""
      resource = self.hs.get_media_repository_resource()
      return resource
  def test_cache_returns_correct_type(self):
      """The same preview is returned fresh, from the in-memory cache and from the database cache."""
      self.lookups["matrix.org"] = [(IPv4Address, "10.1.2.3")]

      preview_path = "preview_url?url=http://matrix.org"
      cache_key = "http://matrix.org"

      first = self.make_request(
          "GET",
          preview_path,
          shorthand=False,
          await_result=False,
      )
      self.pump()

      # Hook Synapse's outbound connection up to a fake remote server.
      factory = self.reactor.tcpClients[0][2]
      client = factory.buildProtocol(None)
      remote = AccumulatingProtocol()
      remote.makeConnection(FakeTransport(client, self.reactor))
      client.makeConnection(FakeTransport(remote, self.reactor))

      status_and_headers = (
          b"HTTP/1.0 200 OK\r\nContent-Length: %d\r\nContent-Type: text/html\r\n\r\n"
          % (len(self.end_content),)
      )
      client.dataReceived(status_and_headers + self.end_content)

      self.pump()
      self.assertEqual(first.code, 200)
      self.assertEqual(
          first.json_body, {"og:title": "~matrix~", "og:description": "hi"}
      )

      # A second request is answered without any further remote traffic
      # being fed in, and must carry the same content.
      second = self.make_request("GET", preview_path, shorthand=False)
      self.assertEqual(second.code, 200)
      self.assertEqual(
          second.json_body, {"og:title": "~matrix~", "og:description": "hi"}
      )

      # Evict the entry from the in-memory cache.
      self.assertIn(cache_key, self.preview_url._cache)
      self.preview_url._cache.pop(cache_key)
      self.assertNotIn(cache_key, self.preview_url._cache)

      # The database cache should now provide the same response.
      third = self.make_request("GET", preview_path, shorthand=False)
      self.assertEqual(third.code, 200)
      self.assertEqual(
          third.json_body, {"og:title": "~matrix~", "og:description": "hi"}
      )
  def test_non_ascii_preview_httpequiv(self):
      """The charset from an http-equiv meta tag is used to decode the title.

      The HTTP header claims utf8 while the document declares windows-1251;
      the expected title is the windows-1251 decoding of the raw bytes.
      """
      self.lookups["matrix.org"] = [(IPv4Address, "10.1.2.3")]

      page = b"".join(
          (
              b"<html><head>",
              b'<meta http-equiv="Content-Type" content="text/html; charset=windows-1251"/>',
              b'<meta property="og:title" content="\xe4\xea\xe0" />',
              b'<meta property="og:description" content="hi" />',
              b"</head></html>",
          )
      )

      preview = self.make_request(
          "GET",
          "preview_url?url=http://matrix.org",
          shorthand=False,
          await_result=False,
      )
      self.pump()

      factory = self.reactor.tcpClients[0][2]
      client = factory.buildProtocol(None)
      remote = AccumulatingProtocol()
      remote.makeConnection(FakeTransport(client, self.reactor))
      client.makeConnection(FakeTransport(remote, self.reactor))

      status_and_headers = (
          b"HTTP/1.0 200 OK\r\nContent-Length: %d\r\n"
          b'Content-Type: text/html; charset="utf8"\r\n\r\n'
      ) % (len(page),)
      client.dataReceived(status_and_headers + page)

      self.pump()
      self.assertEqual(preview.code, 200)
      self.assertEqual(preview.json_body["og:title"], "\u0434\u043a\u0430")
  def test_non_ascii_preview_content_type(self):
      """The charset from the Content-Type response header is used to decode the title."""
      self.lookups["matrix.org"] = [(IPv4Address, "10.1.2.3")]

      page = b"".join(
          (
              b"<html><head>",
              b'<meta property="og:title" content="\xe4\xea\xe0" />',
              b'<meta property="og:description" content="hi" />',
              b"</head></html>",
          )
      )

      preview = self.make_request(
          "GET",
          "preview_url?url=http://matrix.org",
          shorthand=False,
          await_result=False,
      )
      self.pump()

      factory = self.reactor.tcpClients[0][2]
      client = factory.buildProtocol(None)
      remote = AccumulatingProtocol()
      remote.makeConnection(FakeTransport(client, self.reactor))
      client.makeConnection(FakeTransport(remote, self.reactor))

      # Only the header declares the encoding; the document itself does not.
      status_and_headers = (
          b"HTTP/1.0 200 OK\r\nContent-Length: %d\r\n"
          b'Content-Type: text/html; charset="windows-1251"\r\n\r\n'
      ) % (len(page),)
      client.dataReceived(status_and_headers + page)

      self.pump()
      self.assertEqual(preview.code, 200)
      self.assertEqual(preview.json_body["og:title"], "\u0434\u043a\u0430")
  def test_overlong_title(self):
      """A 2000-character <title> is left out of the preview; only the description remains."""
      self.lookups["matrix.org"] = [(IPv4Address, "10.1.2.3")]

      page = b"".join(
          (
              b"<html><head>",
              b"<title>",
              b"x" * 2000,
              b"</title>",
              b'<meta property="og:description" content="hi" />',
              b"</head></html>",
          )
      )

      preview = self.make_request(
          "GET",
          "preview_url?url=http://matrix.org",
          shorthand=False,
          await_result=False,
      )
      self.pump()

      factory = self.reactor.tcpClients[0][2]
      client = factory.buildProtocol(None)
      remote = AccumulatingProtocol()
      remote.makeConnection(FakeTransport(client, self.reactor))
      client.makeConnection(FakeTransport(remote, self.reactor))

      status_and_headers = (
          b"HTTP/1.0 200 OK\r\nContent-Length: %d\r\n"
          b'Content-Type: text/html; charset="windows-1251"\r\n\r\n'
      ) % (len(page),)
      client.dataReceived(status_and_headers + page)

      self.pump()
      self.assertEqual(preview.code, 200)

      # The title is too long and should be stripped, leaving
      # `og:description` as the only key.
      preview_fields = preview.json_body
      self.assertCountEqual(["og:description"], preview_fields.keys())
  def test_ipaddr(self):
      """
      A hostname that resolves to an IP address outside the configured
      blacklist can be previewed.
      """
      self.lookups["example.com"] = [(IPv4Address, "10.1.2.3")]

      preview = self.make_request(
          "GET",
          "preview_url?url=http://example.com",
          shorthand=False,
          await_result=False,
      )
      self.pump()

      factory = self.reactor.tcpClients[0][2]
      client = factory.buildProtocol(None)
      remote = AccumulatingProtocol()
      remote.makeConnection(FakeTransport(client, self.reactor))
      client.makeConnection(FakeTransport(remote, self.reactor))

      status_and_headers = (
          b"HTTP/1.0 200 OK\r\nContent-Length: %d\r\nContent-Type: text/html\r\n\r\n"
          % (len(self.end_content),)
      )
      client.dataReceived(status_and_headers + self.end_content)

      self.pump()
      self.assertEqual(preview.code, 200)
      self.assertEqual(
          preview.json_body, {"og:title": "~matrix~", "og:description": "hi"}
      )
  def test_blacklisted_ip_specific(self):
      """
      A hostname resolving to an individually blacklisted IPv4 address is
      not spidered.
      """
      self.lookups["example.com"] = [(IPv4Address, "192.168.1.1")]

      preview = self.make_request(
          "GET", "preview_url?url=http://example.com", shorthand=False
      )

      # Synapse must not have opened any outbound connection.
      outbound_connections = len(self.reactor.tcpClients)
      self.assertEqual(outbound_connections, 0)

      self.assertEqual(preview.code, 502)
      expected_error = {
          "errcode": "M_UNKNOWN",
          "error": "DNS resolution failure during URL preview generation",
      }
      self.assertEqual(preview.json_body, expected_error)
  def test_blacklisted_ip_range(self):
      """
      A hostname resolving to an address inside a blacklisted IPv4 range is
      not spidered.
      """
      # 1.1.1.2 falls inside the blacklisted 1.0.0.0/8 range.
      self.lookups["example.com"] = [(IPv4Address, "1.1.1.2")]

      preview = self.make_request(
          "GET", "preview_url?url=http://example.com", shorthand=False
      )

      self.assertEqual(preview.code, 502)
      expected_error = {
          "errcode": "M_UNKNOWN",
          "error": "DNS resolution failure during URL preview generation",
      }
      self.assertEqual(preview.json_body, expected_error)
  def test_blacklisted_ip_specific_direct(self):
      """
      A URL whose host is an individually blacklisted IP literal is rejected
      without spidering.
      """
      preview = self.make_request(
          "GET", "preview_url?url=http://192.168.1.1", shorthand=False
      )

      # Synapse must not have opened any outbound connection.
      outbound_connections = len(self.reactor.tcpClients)
      self.assertEqual(outbound_connections, 0)

      expected_error = {
          "errcode": "M_UNKNOWN",
          "error": "IP address blocked by IP blacklist entry",
      }
      self.assertEqual(preview.json_body, expected_error)
      self.assertEqual(preview.code, 403)
  def test_blacklisted_ip_range_direct(self):
      """
      A URL whose host is an IP literal inside a blacklisted range is
      rejected without spidering.
      """
      preview = self.make_request(
          "GET", "preview_url?url=http://1.1.1.2", shorthand=False
      )

      self.assertEqual(preview.code, 403)
      expected_error = {
          "errcode": "M_UNKNOWN",
          "error": "IP address blocked by IP blacklist entry",
      }
      self.assertEqual(preview.json_body, expected_error)
  def test_blacklisted_ip_range_whitelisted_ip(self):
      """
      An address inside a blacklisted range that is also whitelisted can
      still be spidered.
      """
      # 1.1.1.1 is whitelisted even though 1.0.0.0/8 is blacklisted.
      self.lookups["example.com"] = [(IPv4Address, "1.1.1.1")]

      preview = self.make_request(
          "GET",
          "preview_url?url=http://example.com",
          shorthand=False,
          await_result=False,
      )
      self.pump()

      factory = self.reactor.tcpClients[0][2]
      client = factory.buildProtocol(None)
      remote = AccumulatingProtocol()
      remote.makeConnection(FakeTransport(client, self.reactor))
      client.makeConnection(FakeTransport(remote, self.reactor))

      status_and_headers = (
          b"HTTP/1.0 200 OK\r\nContent-Length: %d\r\nContent-Type: text/html\r\n\r\n"
          % (len(self.end_content),)
      )
      client.dataReceived(status_and_headers + self.end_content)

      self.pump()
      self.assertEqual(preview.code, 200)
      self.assertEqual(
          preview.json_body, {"og:title": "~matrix~", "og:description": "hi"}
      )
  def test_blacklisted_ip_with_external_ip(self):
      """
      A hostname is rejected when any of its addresses is blacklisted, even
      if another address it resolves to is not.
      """
      # Register one blacklisted and one permitted address for the host.
      resolved_addresses = [
          (IPv4Address, "1.1.1.2"),
          (IPv4Address, "10.1.2.3"),
      ]
      self.lookups["example.com"] = resolved_addresses

      preview = self.make_request(
          "GET", "preview_url?url=http://example.com", shorthand=False
      )

      self.assertEqual(preview.code, 502)
      expected_error = {
          "errcode": "M_UNKNOWN",
          "error": "DNS resolution failure during URL preview generation",
      }
      self.assertEqual(preview.json_body, expected_error)
  def test_blacklisted_ipv6_specific(self):
      """
      A hostname resolving to an individually blacklisted IPv6 address is
      not spidered.
      """
      blocked_address = (IPv6Address, "3fff:ffff:ffff:ffff:ffff:ffff:ffff:ffff")
      self.lookups["example.com"] = [blocked_address]

      preview = self.make_request(
          "GET", "preview_url?url=http://example.com", shorthand=False
      )

      # Synapse must not have opened any outbound connection.
      outbound_connections = len(self.reactor.tcpClients)
      self.assertEqual(outbound_connections, 0)

      self.assertEqual(preview.code, 502)
      expected_error = {
          "errcode": "M_UNKNOWN",
          "error": "DNS resolution failure during URL preview generation",
      }
      self.assertEqual(preview.json_body, expected_error)
  def test_blacklisted_ipv6_range(self):
      """
      A hostname resolving to an address inside a blacklisted IPv6 range is
      not spidered.
      """
      # 2001:800::1 falls inside the blacklisted 2001:800::/21 range.
      self.lookups["example.com"] = [(IPv6Address, "2001:800::1")]

      preview = self.make_request(
          "GET", "preview_url?url=http://example.com", shorthand=False
      )

      self.assertEqual(preview.code, 502)
      expected_error = {
          "errcode": "M_UNKNOWN",
          "error": "DNS resolution failure during URL preview generation",
      }
      self.assertEqual(preview.json_body, expected_error)
  def test_OPTIONS(self):
      """
      An OPTIONS request to the preview endpoint succeeds with an empty
      JSON object.
      """
      options_response = self.make_request(
          "OPTIONS", "preview_url?url=http://example.com", shorthand=False
      )

      self.assertEqual(options_response.code, 200)
      self.assertEqual(options_response.json_body, {})
  def test_accept_language_config_option(self):
      """
      The configured Accept-Language values are sent to the remote server.
      """
      self.lookups["example.com"] = [(IPv4Address, "10.1.2.3")]

      # Start the preview request without waiting for it to finish.
      preview = self.make_request(
          "GET",
          "preview_url?url=http://example.com",
          shorthand=False,
          await_result=False,
      )
      self.pump()

      # Pull out Synapse's outbound TCP client and pair it with a fake
      # remote server that records everything it receives.
      factory = self.reactor.tcpClients[0][2]
      client = factory.buildProtocol(None)
      remote = AccumulatingProtocol()
      remote.makeConnection(FakeTransport(client, self.reactor))
      client.makeConnection(FakeTransport(remote, self.reactor))

      # Deliver the remote server's reply to Synapse.
      status_and_headers = (
          b"HTTP/1.0 200 OK\r\nContent-Length: %d\r\nContent-Type: text/html\r\n\r\n"
          % (len(self.end_content),)
      )
      client.dataReceived(status_and_headers + self.end_content)

      # Run the reactor until the original request has a response.
      self.pump()
      self.assertEqual(preview.code, 200)
      self.assertEqual(
          preview.json_body, {"og:title": "~matrix~", "og:description": "hi"}
      )

      # The request Synapse sent must contain one Accept-Language header per
      # configured value, in the configured order.
      expected_headers = (
          b"Accept-Language: en-UK\r\n"
          b"Accept-Language: en-US;q=0.9\r\n"
          b"Accept-Language: fr;q=0.8\r\n"
          b"Accept-Language: *;q=0.7"
      )
      self.assertIn(expected_headers, remote.data)
  def test_oembed_photo(self):
      """An oEmbed 'photo' response causes the referenced image URL to be fetched and previewed."""
      for hostname in ("publish.twitter.com", "cdn.twitter.com"):
          self.lookups[hostname] = [(IPv4Address, "10.1.2.3")]

      oembed_payload = json.dumps(
          {
              "version": "1.0",
              "type": "photo",
              "url": "http://cdn.twitter.com/matrixdotorg",
          }
      ).encode("utf-8")

      preview = self.make_request(
          "GET",
          "preview_url?url=http://twitter.com/matrixdotorg/status/12345",
          shorthand=False,
          await_result=False,
      )
      self.pump()

      # First outbound connection: the oEmbed endpoint.
      oembed_client = self.reactor.tcpClients[0][2].buildProtocol(None)
      oembed_remote = AccumulatingProtocol()
      oembed_remote.makeConnection(FakeTransport(oembed_client, self.reactor))
      oembed_client.makeConnection(FakeTransport(oembed_remote, self.reactor))

      json_headers = (
          b"HTTP/1.0 200 OK\r\nContent-Length: %d\r\n"
          b'Content-Type: application/json; charset="utf8"\r\n\r\n'
      ) % (len(oembed_payload),)
      oembed_client.dataReceived(json_headers + oembed_payload)

      self.pump()

      # Second outbound connection: the photo URL from the oEmbed response.
      image_client = self.reactor.tcpClients[1][2].buildProtocol(None)
      image_remote = AccumulatingProtocol()
      image_remote.makeConnection(FakeTransport(image_client, self.reactor))
      image_client.makeConnection(FakeTransport(image_remote, self.reactor))

      image_headers = (
          b"HTTP/1.0 200 OK\r\nContent-Length: %d\r\n"
          b"Content-Type: image/png\r\n\r\n"
      ) % (len(SMALL_PNG),)
      image_client.dataReceived(image_headers + SMALL_PNG)

      self.pump()

      # The image request must have targeted the advertised path.
      self.assertIn(b"/matrixdotorg", image_remote.data)

      self.assertEqual(preview.code, 200)
      og = preview.json_body
      self.assertEqual(og["og:url"], "http://twitter.com/matrixdotorg/status/12345")
      self.assertTrue(og["og:image"].startswith("mxc://"))
      self.assertEqual(og["og:image:height"], 1)
      self.assertEqual(og["og:image:width"], 1)
      self.assertEqual(og["og:image:type"], "image/png")
  def test_oembed_rich(self):
      """An oEmbed 'rich' response has its HTML turned into the preview description."""
      self.lookups["publish.twitter.com"] = [(IPv4Address, "10.1.2.3")]

      oembed_payload = json.dumps(
          {
              "version": "1.0",
              "type": "rich",
              # Only the author is supplied here, no title.
              "author_name": "Alice",
              "html": "<div>Content Preview</div>",
          }
      ).encode("utf-8")

      preview = self.make_request(
          "GET",
          "preview_url?url=http://twitter.com/matrixdotorg/status/12345",
          shorthand=False,
          await_result=False,
      )
      self.pump()

      factory = self.reactor.tcpClients[0][2]
      client = factory.buildProtocol(None)
      remote = AccumulatingProtocol()
      remote.makeConnection(FakeTransport(client, self.reactor))
      client.makeConnection(FakeTransport(remote, self.reactor))

      json_headers = (
          b"HTTP/1.0 200 OK\r\nContent-Length: %d\r\n"
          b'Content-Type: application/json; charset="utf8"\r\n\r\n'
      ) % (len(oembed_payload),)
      client.dataReceived(json_headers + oembed_payload)

      self.pump()
      self.assertEqual(preview.code, 200)

      # The author name is expected to appear as the title.
      expected_og = {
          "og:url": "http://twitter.com/matrixdotorg/status/12345",
          "og:title": "Alice",
          "og:description": "Content Preview",
      }
      self.assertEqual(preview.json_body, expected_og)
  def test_oembed_format(self):
      """An oEmbed endpoint with a ``{format}`` placeholder is requested as JSON."""
      self.lookups["www.hulu.com"] = [(IPv4Address, "10.1.2.3")]

      oembed_payload = json.dumps(
          {
              "version": "1.0",
              "type": "rich",
              "html": "<div>Content Preview</div>",
          }
      ).encode("utf-8")

      preview = self.make_request(
          "GET",
          "preview_url?url=http://www.hulu.com/watch/12345",
          shorthand=False,
          await_result=False,
      )
      self.pump()

      factory = self.reactor.tcpClients[0][2]
      client = factory.buildProtocol(None)
      remote = AccumulatingProtocol()
      remote.makeConnection(FakeTransport(client, self.reactor))
      client.makeConnection(FakeTransport(remote, self.reactor))

      json_headers = (
          b"HTTP/1.0 200 OK\r\nContent-Length: %d\r\n"
          b'Content-Type: application/json; charset="utf8"\r\n\r\n'
      ) % (len(oembed_payload),)
      client.dataReceived(json_headers + oembed_payload)

      self.pump()

      sent_request = remote.data
      # The {format} placeholder in the endpoint must have become "json" ...
      self.assertIn(b"/api/oembed.json", sent_request)
      # ... and a format=json query parameter must have been added.
      self.assertIn(b"format=json", sent_request)

      self.assertEqual(preview.code, 200)
      expected_og = {
          "og:url": "http://www.hulu.com/watch/12345",
          "og:description": "Content Preview",
      }
      self.assertEqual(preview.json_body, expected_og)
  def test_oembed_autodiscovery(self):
      """
      An oEmbed link found in an HTML response is followed to build the preview.

      1. A preview is requested for a URL that matches no oEmbed pattern.
      2. The HTML that comes back contains a link to an oEmbed endpoint.
      3. The oEmbed endpoint is requested and answers with an image URL.
      4. The image is requested so that it can be thumbnailed.
      """
      # Slightly cheesy: the www subdomain is not in the list of oEmbed
      # patterns, so the first response is treated as "raw" HTML.
      for hostname in ("www.twitter.com", "publish.twitter.com", "cdn.twitter.com"):
          self.lookups[hostname] = [(IPv4Address, "10.1.2.3")]

      html_page = b"""
      <link rel="alternate" type="application/json+oembed"
          href="http://publish.twitter.com/oembed?url=http%3A%2F%2Fcdn.twitter.com%2Fmatrixdotorg%2Fstatus%2F12345&format=json"
          title="matrixdotorg" />
      """

      preview = self.make_request(
          "GET",
          "preview_url?url=http://www.twitter.com/matrixdotorg/status/12345",
          shorthand=False,
          await_result=False,
      )
      self.pump()

      # First outbound connection: the page itself.
      page_client = self.reactor.tcpClients[0][2].buildProtocol(None)
      page_remote = AccumulatingProtocol()
      page_remote.makeConnection(FakeTransport(page_client, self.reactor))
      page_client.makeConnection(FakeTransport(page_remote, self.reactor))

      html_headers = (
          b"HTTP/1.0 200 OK\r\nContent-Length: %d\r\n"
          b'Content-Type: text/html; charset="utf8"\r\n\r\n'
      ) % (len(html_page),)
      page_client.dataReceived(html_headers + html_page)

      self.pump()

      # The oEmbed response that the discovered endpoint will return.
      oembed_payload = json.dumps(
          {
              "version": "1.0",
              "type": "photo",
              "url": "http://cdn.twitter.com/matrixdotorg",
          }
      ).encode("utf-8")

      # Second outbound connection: the discovered oEmbed URL.
      oembed_client = self.reactor.tcpClients[1][2].buildProtocol(None)
      oembed_remote = AccumulatingProtocol()
      oembed_remote.makeConnection(FakeTransport(oembed_client, self.reactor))
      oembed_client.makeConnection(FakeTransport(oembed_remote, self.reactor))

      json_headers = (
          b"HTTP/1.0 200 OK\r\nContent-Length: %d\r\n"
          b'Content-Type: application/json; charset="utf8"\r\n\r\n'
      ) % (len(oembed_payload),)
      oembed_client.dataReceived(json_headers + oembed_payload)

      self.pump()

      # The oEmbed request must have targeted the discovered endpoint.
      self.assertIn(b"/oembed?", oembed_remote.data)

      # Third outbound connection: the photo URL.
      image_client = self.reactor.tcpClients[2][2].buildProtocol(None)
      image_remote = AccumulatingProtocol()
      image_remote.makeConnection(FakeTransport(image_client, self.reactor))
      image_client.makeConnection(FakeTransport(image_remote, self.reactor))

      image_headers = (
          b"HTTP/1.0 200 OK\r\nContent-Length: %d\r\n"
          b"Content-Type: image/png\r\n\r\n"
      ) % (len(SMALL_PNG),)
      image_client.dataReceived(image_headers + SMALL_PNG)

      self.pump()

      # The image request must have targeted the advertised path.
      self.assertIn(b"/matrixdotorg", image_remote.data)

      self.assertEqual(preview.code, 200)
      og = preview.json_body
      self.assertEqual(
          og["og:url"], "http://www.twitter.com/matrixdotorg/status/12345"
      )
      self.assertTrue(og["og:image"].startswith("mxc://"))
      self.assertEqual(og["og:image:height"], 1)
      self.assertEqual(og["og:image:width"], 1)
      self.assertEqual(og["og:image:type"], "image/png")
  def _download_image(self):
      """Preview an image URL so that the image lands in the URL cache.

      Returns:
          A (host, media_id) tuple taken from the MXC URI of the image.
      """
      self.lookups["cdn.twitter.com"] = [(IPv4Address, "10.1.2.3")]

      preview = self.make_request(
          "GET",
          "preview_url?url=http://cdn.twitter.com/matrixdotorg",
          shorthand=False,
          await_result=False,
      )
      self.pump()

      factory = self.reactor.tcpClients[0][2]
      client = factory.buildProtocol(None)
      remote = AccumulatingProtocol()
      remote.makeConnection(FakeTransport(client, self.reactor))
      client.makeConnection(FakeTransport(remote, self.reactor))

      image_headers = (
          b"HTTP/1.0 200 OK\r\nContent-Length: %d\r\nContent-Type: image/png\r\n\r\n"
          % (len(SMALL_PNG),)
      )
      client.dataReceived(image_headers + SMALL_PNG)

      self.pump()
      self.assertEqual(preview.code, 200)

      # Split the returned MXC URI; no port is expected in it.
      image_uri = preview.json_body["og:image"]
      host, port, media_id = parse_and_validate_mxc_uri(image_uri)
      self.assertIsNone(port)

      return host, media_id
  def test_storage_providers_exclude_files(self):
      """URL cache files are neither written to nor served from storage providers."""
      host, media_id = self._download_image()

      relative_path = self.preview_url.filepaths.url_cache_filepath_rel(media_id)
      in_media_store = os.path.join(self.media_store_path, relative_path)
      in_storage_provider = os.path.join(self.storage_path, relative_path)

      # Storage: the file exists in the media store only.
      self.assertTrue(os.path.isfile(in_media_store))
      self.assertFalse(
          os.path.isfile(in_storage_provider),
          "URL cache file was unexpectedly stored in a storage provider",
      )

      # Fetching: the file is downloadable while it sits in the media store.
      first_download = self.make_request(
          "GET",
          f"download/{host}/{media_id}",
          shorthand=False,
          await_result=False,
      )
      self.pump()
      self.assertEqual(first_download.code, 200)

      # Relocate the cached file into the storage provider's directory.
      os.makedirs(os.path.dirname(in_storage_provider), exist_ok=True)
      os.rename(in_media_store, in_storage_provider)

      # The download must now fail rather than fall back to the provider.
      second_download = self.make_request(
          "GET",
          f"download/{host}/{media_id}",
          shorthand=False,
          await_result=False,
      )
      self.pump()
      self.assertEqual(
          second_download.code,
          404,
          "URL cache file was unexpectedly retrieved from a storage provider",
      )
  def test_storage_providers_exclude_thumbnails(self):
      """URL cache thumbnails are neither written to nor served from storage providers."""
      host, media_id = self._download_image()

      filepaths = self.preview_url.filepaths
      relative_thumbnail_dir = filepaths.url_cache_thumbnail_directory_rel(media_id)
      thumbnails_in_media_store = os.path.join(
          self.media_store_path, relative_thumbnail_dir
      )
      thumbnails_in_storage_provider = os.path.join(
          self.storage_path, relative_thumbnail_dir
      )

      # Storage: the thumbnail directory exists in the media store only.
      self.assertTrue(os.path.isdir(thumbnails_in_media_store))
      self.assertFalse(
          os.path.isdir(thumbnails_in_storage_provider),
          "URL cache thumbnails were unexpectedly stored in a storage provider",
      )

      # Fetching: a thumbnail is served while it sits in the media store.
      first_fetch = self.make_request(
          "GET",
          f"thumbnail/{host}/{media_id}?width=32&height=32&method=scale",
          shorthand=False,
          await_result=False,
      )
      self.pump()
      self.assertEqual(first_fetch.code, 200)

      # Delete the original image, otherwise thumbnails will regenerate.
      relative_file_path = self.preview_url.filepaths.url_cache_filepath_rel(media_id)
      original_image = os.path.join(self.media_store_path, relative_file_path)
      os.remove(original_image)

      # Relocate the cached thumbnails into the storage provider's directory.
      os.makedirs(os.path.dirname(thumbnails_in_storage_provider), exist_ok=True)
      os.rename(thumbnails_in_media_store, thumbnails_in_storage_provider)

      # The thumbnail request must now fail rather than use the provider.
      second_fetch = self.make_request(
          "GET",
          f"thumbnail/{host}/{media_id}?width=32&height=32&method=scale",
          shorthand=False,
          await_result=False,
      )
      self.pump()
      self.assertEqual(
          second_fetch.code,
          404,
          "URL cache thumbnail was unexpectedly retrieved from a storage provider",
      )
  def test_cache_expiry(self):
      """Expired URL cache files, thumbnails and their directories are removed."""
      # Swap in a mock clock so that time can be advanced manually.
      self.preview_url.clock = MockClock()

      _host, media_id = self._download_image()

      filepaths = self.preview_url.filepaths
      file_path = filepaths.url_cache_filepath(media_id)
      file_dirs = filepaths.url_cache_filepath_dirs_to_delete(media_id)
      thumbnail_dir = filepaths.url_cache_thumbnail_directory(media_id)
      thumbnail_dirs = filepaths.url_cache_thumbnail_dirs_to_delete(media_id)

      # Both the image and its thumbnail directory exist before expiry.
      self.assertTrue(os.path.isfile(file_path))
      self.assertTrue(os.path.isdir(thumbnail_dir))

      # Step just past the expiry window and run the cleanup.
      self.preview_url.clock.advance_time_msec(IMAGE_CACHE_EXPIRY_MS + 1)
      self.get_success(self.preview_url._expire_url_cache_data())

      expected_gone = [file_path] + file_dirs + [thumbnail_dir] + thumbnail_dirs
      for leftover in expected_gone:
          still_there = os.path.exists(leftover)
          self.assertFalse(
              still_there,
              f"{os.path.relpath(leftover, self.media_store_path)} was not deleted",
          )