test_url_preview.py 33 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982
  1. # Copyright 2018 New Vector Ltd
  2. #
  3. # Licensed under the Apache License, Version 2.0 (the "License");
  4. # you may not use this file except in compliance with the License.
  5. # You may obtain a copy of the License at
  6. #
  7. # http://www.apache.org/licenses/LICENSE-2.0
  8. #
  9. # Unless required by applicable law or agreed to in writing, software
  10. # distributed under the License is distributed on an "AS IS" BASIS,
  11. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. # See the License for the specific language governing permissions and
  13. # limitations under the License.
  14. import json
  15. import os
  16. import re
  17. from twisted.internet._resolver import HostResolution
  18. from twisted.internet.address import IPv4Address, IPv6Address
  19. from twisted.internet.error import DNSLookupError
  20. from twisted.test.proto_helpers import AccumulatingProtocol
  21. from synapse.config.oembed import OEmbedEndpointConfig
  22. from synapse.rest.media.v1.preview_url_resource import IMAGE_CACHE_EXPIRY_MS
  23. from synapse.util.stringutils import parse_and_validate_mxc_uri
  24. from tests import unittest
  25. from tests.server import FakeTransport
  26. from tests.test_utils import SMALL_PNG
  27. from tests.utils import MockClock
  28. try:
  29. import lxml
  30. except ImportError:
  31. lxml = None
  class URLPreviewTests(unittest.HomeserverTestCase):
      """Tests for the ``preview_url`` endpoint of the media repository resource.

      Outbound connections are never really made: DNS is answered from
      ``self.lookups`` by a fake resolver installed in ``prepare``, and each
      test feeds a hand-written HTTP response to the client protocol that the
      homeserver registered on the test reactor.
      """

      if not lxml:
          skip = "url preview feature requires lxml"

      hijack_auth = True
      user_id = "@test:user"

      # Minimal HTML page served by most tests.
      end_content = (
          b"<html><head>"
          b'<meta property="og:title" content="~matrix~" />'
          b'<meta property="og:description" content="hi" />'
          b"</head></html>"
      )

      # The preview the tests expect back when ``end_content`` is served.
      _default_preview = {"og:title": "~matrix~", "og:description": "hi"}

      def make_homeserver(self, reactor, clock):
          """Build a homeserver with URL previews enabled and a test oEmbed config.

          Creates two temporary directories: one used as the media store and one
          used by a file storage provider.

          Returns:
              The homeserver created by ``setup_test_homeserver``.
          """
          config = self.default_config()
          config["url_preview_enabled"] = True
          config["max_spider_size"] = 9999999
          config["url_preview_ip_range_blacklist"] = (
              "192.168.1.1",
              "1.0.0.0/8",
              "3fff:ffff:ffff:ffff:ffff:ffff:ffff:ffff",
              "2001:800::/21",
          )
          config["url_preview_ip_range_whitelist"] = ("1.1.1.1",)
          config["url_preview_url_blacklist"] = []
          config["url_preview_accept_language"] = [
              "en-UK",
              "en-US;q=0.9",
              "fr;q=0.8",
              "*;q=0.7",
          ]

          self.storage_path = self.mktemp()
          self.media_store_path = self.mktemp()
          os.mkdir(self.storage_path)
          os.mkdir(self.media_store_path)
          config["media_store_path"] = self.media_store_path

          provider_config = {
              "module": "synapse.rest.media.v1.storage_provider.FileStorageProviderBackend",
              "store_local": True,
              "store_synchronous": False,
              "store_remote": True,
              "config": {"directory": self.storage_path},
          }
          config["media_storage_providers"] = [provider_config]

          hs = self.setup_test_homeserver(config=config)

          # After the hs is created, modify the parsed oEmbed config (to avoid
          # messing with files).
          #
          # Note that HTTP URLs are used to avoid having to deal with TLS in tests.
          hs.config.oembed.oembed_patterns = [
              OEmbedEndpointConfig(
                  api_endpoint="http://publish.twitter.com/oembed",
                  url_patterns=[
                      re.compile(r"http://twitter\.com/.+/status/.+"),
                  ],
                  formats=None,
              ),
              OEmbedEndpointConfig(
                  api_endpoint="http://www.hulu.com/api/oembed.{format}",
                  url_patterns=[
                      re.compile(r"http://www\.hulu\.com/watch/.+"),
                  ],
                  formats=["json"],
              ),
          ]

          return hs

      def prepare(self, reactor, clock, hs):
          """Grab the preview resource and install a fake hostname resolver.

          The resolver answers only for host names present in ``self.lookups``,
          which maps a host name to a list of ``(address_class, ip)`` pairs.
          """
          self.media_repo = hs.get_media_repository_resource()
          self.preview_url = self.media_repo.children[b"preview_url"]

          self.lookups = {}

          class Resolver:
              def resolveHostName(
                  _self,
                  resolutionReceiver,
                  hostName,
                  portNumber=0,
                  addressTypes=None,
                  transportSemantics="TCP",
              ):
                  # `_self` is the resolver; `self` is the enclosing test case,
                  # captured so that tests can edit `self.lookups` at any time.
                  resolution = HostResolution(hostName)
                  resolutionReceiver.resolutionBegan(resolution)
                  if hostName not in self.lookups:
                      raise DNSLookupError("OH NO")

                  for address_type, ip in self.lookups[hostName]:
                      resolutionReceiver.addressResolved(
                          address_type("TCP", ip, portNumber)
                      )
                  resolutionReceiver.resolutionComplete()
                  return resolutionReceiver

          self.reactor.nameResolver = Resolver()

      def create_test_resource(self):
          """Return the media repository resource that requests are made against."""
          return self.hs.get_media_repository_resource()

      # -- helpers ----------------------------------------------------------

      def _begin_preview(self, url):
          """Start a preview request for ``url`` without waiting for its result.

          The reactor is pumped once so that the outbound connection shows up in
          ``self.reactor.tcpClients``.

          Returns:
              The channel of the (still pending) request.
          """
          channel = self.make_request(
              "GET",
              "preview_url?url=" + url,
              shorthand=False,
              await_result=False,
          )
          self.pump()
          return channel

      def _preview(self, url):
          """Request a preview of ``url`` and return the channel of the request."""
          return self.make_request("GET", "preview_url?url=" + url, shorthand=False)

      def _respond(self, body, content_type, client_index=0):
          """Answer one of the homeserver's outbound connections with a 200 response.

          Args:
              body: The response body, as bytes.
              content_type: The value of the Content-Type header, as bytes.
              client_index: Index into ``self.reactor.tcpClients`` of the
                  connection to answer.

          Returns:
              The fake remote server; its ``data`` attribute holds whatever the
              homeserver sent on this connection.
          """
          client = self.reactor.tcpClients[client_index][2].buildProtocol(None)
          server = AccumulatingProtocol()
          # Wire the two protocols together.
          server.makeConnection(FakeTransport(client, self.reactor))
          client.makeConnection(FakeTransport(server, self.reactor))
          # The body is appended after formatting, so any "%" in it is left alone.
          client.dataReceived(
              b"HTTP/1.0 200 OK\r\nContent-Length: %d\r\nContent-Type: %s\r\n\r\n"
              % (len(body), content_type)
              + body
          )
          self.pump()
          return server

      def _begin_media_request(self, path):
          """Start a GET for ``path`` on the media resource and pump the reactor.

          Returns:
              The channel of the request.
          """
          channel = self.make_request(
              "GET",
              path,
              shorthand=False,
              await_result=False,
          )
          self.pump()
          return channel

      def _assert_dns_failure(self, channel):
          """Assert that ``channel`` holds the 502 DNS-failure error response."""
          self.assertEqual(channel.code, 502)
          self.assertEqual(
              channel.json_body,
              {
                  "errcode": "M_UNKNOWN",
                  "error": "DNS resolution failure during URL preview generation",
              },
          )

      def _assert_ip_blocked(self, channel):
          """Assert that ``channel`` holds the 403 IP-blacklist error response."""
          self.assertEqual(channel.code, 403)
          self.assertEqual(
              channel.json_body,
              {
                  "errcode": "M_UNKNOWN",
                  "error": "IP address blocked by IP blacklist entry",
              },
          )

      # -- caching and HTML parsing ------------------------------------------

      def test_cache_returns_correct_type(self):
          """The in-memory and database caches return the same preview as a fetch."""
          self.lookups["matrix.org"] = [(IPv4Address, "10.1.2.3")]

          channel = self._begin_preview("http://matrix.org")
          self._respond(self.end_content, b"text/html")
          self.assertEqual(channel.code, 200)
          self.assertEqual(channel.json_body, self._default_preview)

          # Check the cache returns the correct response
          channel = self._preview("http://matrix.org")
          self.assertEqual(channel.code, 200)
          self.assertEqual(channel.json_body, self._default_preview)

          # Clear the in-memory cache
          self.assertIn("http://matrix.org", self.preview_url._cache)
          self.preview_url._cache.pop("http://matrix.org")
          self.assertNotIn("http://matrix.org", self.preview_url._cache)

          # Check the database cache returns the correct response
          channel = self._preview("http://matrix.org")
          self.assertEqual(channel.code, 200)
          self.assertEqual(channel.json_body, self._default_preview)

      def test_non_ascii_preview_httpequiv(self):
          """A windows-1251 page declared via ``<meta http-equiv>`` is decoded.

          The HTTP Content-Type header claims utf8; the expected title is the
          windows-1251 decoding of the served bytes.
          """
          self.lookups["matrix.org"] = [(IPv4Address, "10.1.2.3")]

          end_content = (
              b"<html><head>"
              b'<meta http-equiv="Content-Type" content="text/html; charset=windows-1251"/>'
              b'<meta property="og:title" content="\xe4\xea\xe0" />'
              b'<meta property="og:description" content="hi" />'
              b"</head></html>"
          )

          channel = self._begin_preview("http://matrix.org")
          self._respond(end_content, b'text/html; charset="utf8"')

          self.assertEqual(channel.code, 200)
          self.assertEqual(channel.json_body["og:title"], "\u0434\u043a\u0430")

      def test_non_ascii_preview_content_type(self):
          """A windows-1251 page declared via the Content-Type header is decoded."""
          self.lookups["matrix.org"] = [(IPv4Address, "10.1.2.3")]

          end_content = (
              b"<html><head>"
              b'<meta property="og:title" content="\xe4\xea\xe0" />'
              b'<meta property="og:description" content="hi" />'
              b"</head></html>"
          )

          channel = self._begin_preview("http://matrix.org")
          self._respond(end_content, b'text/html; charset="windows-1251"')

          self.assertEqual(channel.code, 200)
          self.assertEqual(channel.json_body["og:title"], "\u0434\u043a\u0430")

      def test_overlong_title(self):
          """A 2000-character ``<title>`` is left out of the preview."""
          self.lookups["matrix.org"] = [(IPv4Address, "10.1.2.3")]

          end_content = (
              b"<html><head>"
              b"<title>" + b"x" * 2000 + b"</title>"
              b'<meta property="og:description" content="hi" />'
              b"</head></html>"
          )

          channel = self._begin_preview("http://matrix.org")
          self._respond(end_content, b'text/html; charset="windows-1251"')

          self.assertEqual(channel.code, 200)
          res = channel.json_body
          # We should only see the `og:description` field, as `title` is too long
          # and should be stripped out
          self.assertCountEqual(["og:description"], res.keys())

      # -- IP blacklisting ----------------------------------------------------

      def test_ipaddr(self):
          """
          A hostname resolving to a non-blacklisted IP address can be previewed.
          """
          self.lookups["example.com"] = [(IPv4Address, "10.1.2.3")]

          channel = self._begin_preview("http://example.com")
          self._respond(self.end_content, b"text/html")

          self.assertEqual(channel.code, 200)
          self.assertEqual(channel.json_body, self._default_preview)

      def test_blacklisted_ip_specific(self):
          """
          Blacklisted IP addresses, found via DNS, are not spidered.
          """
          self.lookups["example.com"] = [(IPv4Address, "192.168.1.1")]

          channel = self._preview("http://example.com")

          # No requests made.
          self.assertEqual(len(self.reactor.tcpClients), 0)
          self._assert_dns_failure(channel)

      def test_blacklisted_ip_range(self):
          """
          Blacklisted IP ranges, IPs found over DNS, are not spidered.
          """
          self.lookups["example.com"] = [(IPv4Address, "1.1.1.2")]

          channel = self._preview("http://example.com")

          self._assert_dns_failure(channel)

      def test_blacklisted_ip_specific_direct(self):
          """
          Blacklisted IP addresses, accessed directly, are not spidered.
          """
          channel = self._preview("http://192.168.1.1")

          # No requests made.
          self.assertEqual(len(self.reactor.tcpClients), 0)
          self._assert_ip_blocked(channel)

      def test_blacklisted_ip_range_direct(self):
          """
          Blacklisted IP ranges, accessed directly, are not spidered.
          """
          channel = self._preview("http://1.1.1.2")

          self._assert_ip_blocked(channel)

      def test_blacklisted_ip_range_whitelisted_ip(self):
          """
          Blacklisted but then subsequently whitelisted IP addresses can be
          spidered.
          """
          self.lookups["example.com"] = [(IPv4Address, "1.1.1.1")]

          channel = self._begin_preview("http://example.com")
          self._respond(self.end_content, b"text/html")

          self.assertEqual(channel.code, 200)
          self.assertEqual(channel.json_body, self._default_preview)

      def test_blacklisted_ip_with_external_ip(self):
          """
          If a hostname resolves a blacklisted IP, even if there's a
          non-blacklisted one, it will be rejected.
          """
          # Hardcode the URL resolving to the IP we want.
          self.lookups["example.com"] = [
              (IPv4Address, "1.1.1.2"),
              (IPv4Address, "10.1.2.3"),
          ]

          channel = self._preview("http://example.com")

          self._assert_dns_failure(channel)

      def test_blacklisted_ipv6_specific(self):
          """
          Blacklisted IPv6 addresses, found via DNS, are not spidered.
          """
          self.lookups["example.com"] = [
              (IPv6Address, "3fff:ffff:ffff:ffff:ffff:ffff:ffff:ffff")
          ]

          channel = self._preview("http://example.com")

          # No requests made.
          self.assertEqual(len(self.reactor.tcpClients), 0)
          self._assert_dns_failure(channel)

      def test_blacklisted_ipv6_range(self):
          """
          Blacklisted IPv6 ranges, IPs found over DNS, are not spidered.
          """
          self.lookups["example.com"] = [(IPv6Address, "2001:800::1")]

          channel = self._preview("http://example.com")

          self._assert_dns_failure(channel)

      # -- request handling ---------------------------------------------------

      def test_OPTIONS(self):
          """
          OPTIONS returns the OPTIONS.
          """
          channel = self.make_request(
              "OPTIONS", "preview_url?url=http://example.com", shorthand=False
          )
          self.assertEqual(channel.code, 200)
          self.assertEqual(channel.json_body, {})

      def test_accept_language_config_option(self):
          """
          Accept-Language header is sent to the remote server
          """
          self.lookups["example.com"] = [(IPv4Address, "10.1.2.3")]

          # Make the request, then reply as the remote server and move the
          # reactor along until the original channel has its response.
          channel = self._begin_preview("http://example.com")
          server = self._respond(self.end_content, b"text/html")

          self.assertEqual(channel.code, 200)
          self.assertEqual(channel.json_body, self._default_preview)

          # Check that the server received the Accept-Language header as part
          # of the request from Synapse
          self.assertIn(
              (
                  b"Accept-Language: en-UK\r\n"
                  b"Accept-Language: en-US;q=0.9\r\n"
                  b"Accept-Language: fr;q=0.8\r\n"
                  b"Accept-Language: *;q=0.7"
              ),
              server.data,
          )

      # -- oEmbed -------------------------------------------------------------

      def test_oembed_photo(self):
          """Test an oEmbed endpoint which returns a 'photo' type which redirects the preview to a new URL."""
          self.lookups["publish.twitter.com"] = [(IPv4Address, "10.1.2.3")]
          self.lookups["cdn.twitter.com"] = [(IPv4Address, "10.1.2.3")]

          result = {
              "version": "1.0",
              "type": "photo",
              "url": "http://cdn.twitter.com/matrixdotorg",
          }
          oembed_content = json.dumps(result).encode("utf-8")

          channel = self._begin_preview("http://twitter.com/matrixdotorg/status/12345")
          self._respond(oembed_content, b'application/json; charset="utf8"')

          # Ensure a second request is made to the photo URL.
          server = self._respond(SMALL_PNG, b"image/png", client_index=1)

          # Ensure the URL is what was requested.
          self.assertIn(b"/matrixdotorg", server.data)

          self.assertEqual(channel.code, 200)
          body = channel.json_body
          self.assertEqual(body["og:url"], "http://twitter.com/matrixdotorg/status/12345")
          self.assertTrue(body["og:image"].startswith("mxc://"))
          self.assertEqual(body["og:image:height"], 1)
          self.assertEqual(body["og:image:width"], 1)
          self.assertEqual(body["og:image:type"], "image/png")

      def test_oembed_rich(self):
          """Test an oEmbed endpoint which returns HTML content via the 'rich' type."""
          self.lookups["publish.twitter.com"] = [(IPv4Address, "10.1.2.3")]

          result = {
              "version": "1.0",
              "type": "rich",
              # Note that this provides the author, not the title.
              "author_name": "Alice",
              "html": "<div>Content Preview</div>",
          }
          end_content = json.dumps(result).encode("utf-8")

          channel = self._begin_preview("http://twitter.com/matrixdotorg/status/12345")
          self._respond(end_content, b'application/json; charset="utf8"')

          self.assertEqual(channel.code, 200)
          self.assertEqual(
              channel.json_body,
              {
                  "og:url": "http://twitter.com/matrixdotorg/status/12345",
                  "og:title": "Alice",
                  "og:description": "Content Preview",
              },
          )

      def test_oembed_format(self):
          """Test an oEmbed endpoint which requires the format in the URL."""
          self.lookups["www.hulu.com"] = [(IPv4Address, "10.1.2.3")]

          result = {
              "version": "1.0",
              "type": "rich",
              "html": "<div>Content Preview</div>",
          }
          end_content = json.dumps(result).encode("utf-8")

          channel = self._begin_preview("http://www.hulu.com/watch/12345")
          server = self._respond(end_content, b'application/json; charset="utf8"')

          # The {format} should have been turned into json.
          self.assertIn(b"/api/oembed.json", server.data)
          # A URL parameter of format=json should be provided.
          self.assertIn(b"format=json", server.data)

          self.assertEqual(channel.code, 200)
          self.assertEqual(
              channel.json_body,
              {
                  "og:url": "http://www.hulu.com/watch/12345",
                  "og:description": "Content Preview",
              },
          )

      def test_oembed_autodiscovery(self):
          """
          Autodiscovery works by finding the link in the HTML response and then requesting an oEmbed URL.

          1. Request a preview of a URL which is not known to the oEmbed code.
          2. It returns HTML including a link to an oEmbed preview.
          3. The oEmbed preview is requested and returns a URL for an image.
          4. The image is requested for thumbnailing.
          """
          # This is a little cheesy in that we use the www subdomain (which isn't in
          # the list of oEmbed patterns) to get "raw" HTML response.
          self.lookups["www.twitter.com"] = [(IPv4Address, "10.1.2.3")]
          self.lookups["publish.twitter.com"] = [(IPv4Address, "10.1.2.3")]
          self.lookups["cdn.twitter.com"] = [(IPv4Address, "10.1.2.3")]

          result = b"""
          <link rel="alternate" type="application/json+oembed"
              href="http://publish.twitter.com/oembed?url=http%3A%2F%2Fcdn.twitter.com%2Fmatrixdotorg%2Fstatus%2F12345&format=json"
              title="matrixdotorg" />
          """

          channel = self._begin_preview(
              "http://www.twitter.com/matrixdotorg/status/12345"
          )
          self._respond(result, b'text/html; charset="utf8"')

          # The oEmbed response.
          result2 = {
              "version": "1.0",
              "type": "photo",
              "url": "http://cdn.twitter.com/matrixdotorg",
          }
          oembed_content = json.dumps(result2).encode("utf-8")

          # Ensure a second request is made to the oEmbed URL.
          server = self._respond(
              oembed_content, b'application/json; charset="utf8"', client_index=1
          )

          # Ensure the URL is what was requested.
          self.assertIn(b"/oembed?", server.data)

          # Ensure a third request is made to the photo URL.
          server = self._respond(SMALL_PNG, b"image/png", client_index=2)

          # Ensure the URL is what was requested.
          self.assertIn(b"/matrixdotorg", server.data)

          self.assertEqual(channel.code, 200)
          body = channel.json_body
          self.assertEqual(
              body["og:url"], "http://www.twitter.com/matrixdotorg/status/12345"
          )
          self.assertTrue(body["og:image"].startswith("mxc://"))
          self.assertEqual(body["og:image:height"], 1)
          self.assertEqual(body["og:image:width"], 1)
          self.assertEqual(body["og:image:type"], "image/png")

      # -- URL cache storage --------------------------------------------------

      def _download_image(self):
          """Downloads an image into the URL cache.

          Returns:
              A (host, media_id) tuple representing the MXC URI of the image.
          """
          self.lookups["cdn.twitter.com"] = [(IPv4Address, "10.1.2.3")]

          channel = self._begin_preview("http://cdn.twitter.com/matrixdotorg")
          self._respond(SMALL_PNG, b"image/png")

          self.assertEqual(channel.code, 200)
          mxc_uri = channel.json_body["og:image"]
          host, _port, media_id = parse_and_validate_mxc_uri(mxc_uri)
          self.assertIsNone(_port)
          return host, media_id

      def test_storage_providers_exclude_files(self):
          """Test that files are not stored in or fetched from storage providers."""
          host, media_id = self._download_image()

          rel_file_path = self.preview_url.filepaths.url_cache_filepath_rel(media_id)
          media_store_path = os.path.join(self.media_store_path, rel_file_path)
          storage_provider_path = os.path.join(self.storage_path, rel_file_path)

          # Check storage
          self.assertTrue(os.path.isfile(media_store_path))
          self.assertFalse(
              os.path.isfile(storage_provider_path),
              "URL cache file was unexpectedly stored in a storage provider",
          )

          # Check fetching
          channel = self._begin_media_request(f"download/{host}/{media_id}")
          self.assertEqual(channel.code, 200)

          # Move cached file into the storage provider
          os.makedirs(os.path.dirname(storage_provider_path), exist_ok=True)
          os.rename(media_store_path, storage_provider_path)

          channel = self._begin_media_request(f"download/{host}/{media_id}")
          self.assertEqual(
              channel.code,
              404,
              "URL cache file was unexpectedly retrieved from a storage provider",
          )

      def test_storage_providers_exclude_thumbnails(self):
          """Test that thumbnails are not stored in or fetched from storage providers."""
          host, media_id = self._download_image()

          rel_thumbnail_path = (
              self.preview_url.filepaths.url_cache_thumbnail_directory_rel(media_id)
          )
          media_store_thumbnail_path = os.path.join(
              self.media_store_path, rel_thumbnail_path
          )
          storage_provider_thumbnail_path = os.path.join(
              self.storage_path, rel_thumbnail_path
          )
          thumbnail_request = (
              f"thumbnail/{host}/{media_id}?width=32&height=32&method=scale"
          )

          # Check storage
          self.assertTrue(os.path.isdir(media_store_thumbnail_path))
          self.assertFalse(
              os.path.isdir(storage_provider_thumbnail_path),
              "URL cache thumbnails were unexpectedly stored in a storage provider",
          )

          # Check fetching
          channel = self._begin_media_request(thumbnail_request)
          self.assertEqual(channel.code, 200)

          # Remove the original, otherwise thumbnails will regenerate
          rel_file_path = self.preview_url.filepaths.url_cache_filepath_rel(media_id)
          media_store_path = os.path.join(self.media_store_path, rel_file_path)
          os.remove(media_store_path)

          # Move cached thumbnails into the storage provider
          os.makedirs(os.path.dirname(storage_provider_thumbnail_path), exist_ok=True)
          os.rename(media_store_thumbnail_path, storage_provider_thumbnail_path)

          channel = self._begin_media_request(thumbnail_request)
          self.assertEqual(
              channel.code,
              404,
              "URL cache thumbnail was unexpectedly retrieved from a storage provider",
          )

      def test_cache_expiry(self):
          """Test that URL cache files and thumbnails are cleaned up properly on expiry."""
          self.preview_url.clock = MockClock()

          _host, media_id = self._download_image()

          filepaths = self.preview_url.filepaths
          file_path = filepaths.url_cache_filepath(media_id)
          file_dirs = filepaths.url_cache_filepath_dirs_to_delete(media_id)
          thumbnail_dir = filepaths.url_cache_thumbnail_directory(media_id)
          thumbnail_dirs = filepaths.url_cache_thumbnail_dirs_to_delete(media_id)

          self.assertTrue(os.path.isfile(file_path))
          self.assertTrue(os.path.isdir(thumbnail_dir))

          # Step just past the expiry time, then run the expiry job.
          self.preview_url.clock.advance_time_msec(IMAGE_CACHE_EXPIRY_MS + 1)
          self.get_success(self.preview_url._expire_url_cache_data())

          for path in [file_path] + file_dirs + [thumbnail_dir] + thumbnail_dirs:
              self.assertFalse(
                  os.path.exists(path),
                  f"{os.path.relpath(path, self.media_store_path)} was not deleted",
              )