test_url_preview.py 23 KB


  1. # -*- coding: utf-8 -*-
  2. # Copyright 2018 New Vector Ltd
  3. #
  4. # Licensed under the Apache License, Version 2.0 (the "License");
  5. # you may not use this file except in compliance with the License.
  6. # You may obtain a copy of the License at
  7. #
  8. # http://www.apache.org/licenses/LICENSE-2.0
  9. #
  10. # Unless required by applicable law or agreed to in writing, software
  11. # distributed under the License is distributed on an "AS IS" BASIS,
  12. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. # See the License for the specific language governing permissions and
  14. # limitations under the License.
  15. import json
  16. import os
  17. import re
  18. from mock import patch
  19. import attr
  20. from twisted.internet._resolver import HostResolution
  21. from twisted.internet.address import IPv4Address, IPv6Address
  22. from twisted.internet.error import DNSLookupError
  23. from twisted.python.failure import Failure
  24. from twisted.test.proto_helpers import AccumulatingProtocol
  25. from twisted.web._newclient import ResponseDone
  26. from tests import unittest
  27. from tests.server import FakeTransport
  @attr.s
  class FakeResponse(object):
      """Canned HTTP response object for use in tests.

      Instances carry the response fields as plain attributes, expose a
      ``request`` property whose result has an ``absoluteURI`` attribute, and
      can replay their body into a protocol via ``deliverBody``.
      """

      # attrs-managed fields; positional order of the generated initialiser.
      version = attr.ib()
      code = attr.ib()
      phrase = attr.ib()
      headers = attr.ib()
      body = attr.ib()
      absoluteURI = attr.ib()

      @property
      def request(self):
          """Return a throwaway object whose ``absoluteURI`` mirrors this response's."""
          uri = self.absoluteURI

          # Defined locally on every access. Inside this method the name
          # shadows the ``FakeTransport`` imported from ``tests.server``; the
          # two classes are unrelated.
          @attr.s
          class FakeTransport(object):
              absoluteURI = uri

          return FakeTransport()

      def deliverBody(self, protocol):
          """Hand ``self.body`` to ``protocol``, then close it with ``ResponseDone``.

          Args:
              protocol: object providing ``dataReceived`` and ``connectionLost``.
          """
          protocol.dataReceived(self.body)
          finished = Failure(ResponseDone())
          protocol.connectionLost(finished)
  class URLPreviewTests(unittest.HomeserverTestCase):
      """Tests for the ``preview_url`` child of the media repository resource.

      DNS lookups are answered from ``self.lookups`` by the resolver installed
      in ``prepare``. Remote HTTP servers are simulated by feeding raw response
      bytes into the client protocols built from ``self.reactor.tcpClients``.
      """

      # NOTE(review): presumably read by HomeserverTestCase to bypass real
      # authentication and act as ``user_id`` -- confirm against the base class.
      hijack_auth = True
      user_id = "@test:user"

      # Default page served by the fake remote: just the two OpenGraph tags.
      end_content = (
          b"<html><head>"
          b'<meta property="og:title" content="~matrix~" />'
          b'<meta property="og:description" content="hi" />'
          b"</head></html>"
      )

      def make_homeserver(self, reactor, clock):
          """Create the homeserver under test with URL previews switched on.

          Besides the preview settings (IP black/whitelists, Accept-Language
          values), this creates two temporary directories and registers a
          ``FileStorageProviderBackend`` media storage provider pointing at one
          of them.

          Args:
              reactor: unused here.
              clock: unused here.

          Returns:
              The result of ``self.setup_test_homeserver(config=...)``.
          """
          config = self.default_config()
          config["url_preview_enabled"] = True
          config["max_spider_size"] = 9999999
          config["url_preview_ip_range_blacklist"] = (
              "192.168.1.1",
              "1.0.0.0/8",
              "3fff:ffff:ffff:ffff:ffff:ffff:ffff:ffff",
              "2001:800::/21",
          )
          # 1.1.1.1 sits inside the blacklisted 1.0.0.0/8 range on purpose.
          config["url_preview_ip_range_whitelist"] = ("1.1.1.1",)
          config["url_preview_url_blacklist"] = []
          config["url_preview_accept_language"] = [
              "en-UK",
              "en-US;q=0.9",
              "fr;q=0.8",
              "*;q=0.7",
          ]

          self.storage_path = self.mktemp()
          self.media_store_path = self.mktemp()
          os.mkdir(self.storage_path)
          os.mkdir(self.media_store_path)
          config["media_store_path"] = self.media_store_path

          provider_config = {
              "module": "synapse.rest.media.v1.storage_provider.FileStorageProviderBackend",
              "store_local": True,
              "store_synchronous": False,
              "store_remote": True,
              "config": {"directory": self.storage_path},
          }
          config["media_storage_providers"] = [provider_config]

          return self.setup_test_homeserver(config=config)

      def prepare(self, reactor, clock, hs):
          """Grab the preview resource and install a fake hostname resolver.

          ``self.lookups`` maps a hostname to a list of
          ``(address_class, ip_string)`` pairs; hostnames missing from it make
          the resolver raise ``DNSLookupError``.

          Args:
              reactor: unused here; ``self.reactor`` is what gets patched.
              clock: unused here.
              hs: the homeserver returned by ``make_homeserver``.
          """
          self.media_repo = hs.get_media_repository_resource()
          self.preview_url = self.media_repo.children[b"preview_url"]
          self.lookups = {}

          class Resolver(object):
              # The first parameter is named ``_self`` so that ``self`` in the
              # body still refers to the enclosing test case.
              def resolveHostName(
                  _self,
                  resolutionReceiver,
                  hostName,
                  portNumber=0,
                  addressTypes=None,
                  transportSemantics="TCP",
              ):
                  resolution = HostResolution(hostName)
                  resolutionReceiver.resolutionBegan(resolution)
                  if hostName not in self.lookups:
                      raise DNSLookupError("OH NO")

                  for address_class, ip in self.lookups[hostName]:
                      resolutionReceiver.addressResolved(
                          address_class("TCP", ip, portNumber)
                      )
                  resolutionReceiver.resolutionComplete()
                  return resolutionReceiver

          self.reactor.nameResolver = Resolver()

      # ------------------------------------------------------------------
      # Private helpers shared by the tests below.
      # ------------------------------------------------------------------

      def _request_preview(self, url, method="GET"):
          """Render a preview request for ``url`` and pump the reactor once.

          Args:
              url: appended verbatim after ``url_preview?url=``.
              method: HTTP method of the request made to the preview resource.

          Returns:
              The channel returned by ``self.make_request``.
          """
          request, channel = self.make_request(
              method, "url_preview?url=" + url, shorthand=False
          )
          request.render(self.preview_url)
          self.pump()
          return channel

      def _serve_response(self, body, content_type=b"text/html", client_index=0):
          """Answer one recorded outbound connection with an HTTP/1.0 200 response.

          Args:
              body: response payload (bytes).
              content_type: raw value of the ``Content-Type`` header (bytes).
              client_index: index into ``self.reactor.tcpClients`` of the
                  connection to answer.

          Returns:
              The ``AccumulatingProtocol`` playing the remote server; its
              ``data`` attribute is where the tests look for the bytes the
              client sent.
          """
          client = self.reactor.tcpClients[client_index][2].buildProtocol(None)
          server = AccumulatingProtocol()
          # Cross-connect the two protocols through fake transports.
          server.makeConnection(FakeTransport(client, self.reactor))
          client.makeConnection(FakeTransport(server, self.reactor))
          client.dataReceived(
              b"HTTP/1.0 200 OK\r\nContent-Length: %d\r\nContent-Type: %s\r\n\r\n"
              % (len(body), content_type)
              + body
          )
          self.pump()
          return server

      def _patch_oembed_patterns(self):
          """Return a ``patch.dict`` context routing twitter status URLs to a plain-HTTP oEmbed endpoint."""
          # Route the HTTP version to an HTTP endpoint so that the tests work.
          return patch.dict(
              "synapse.rest.media.v1.preview_url_resource._oembed_patterns",
              {
                  re.compile(
                      r"http://twitter\.com/.+/status/.+"
                  ): "http://publish.twitter.com/oembed",
              },
              clear=True,
          )

      def _assert_default_preview(self, channel):
          """Assert ``channel`` holds a 200 with the OpenGraph data of ``end_content``."""
          self.assertEqual(channel.code, 200)
          self.assertEqual(
              channel.json_body, {"og:title": "~matrix~", "og:description": "hi"}
          )

      def _assert_dns_failure(self, channel):
          """Assert ``channel`` holds the 502 "DNS resolution failure" error."""
          self.assertEqual(channel.code, 502)
          self.assertEqual(
              channel.json_body,
              {
                  "errcode": "M_UNKNOWN",
                  "error": "DNS resolution failure during URL preview generation",
              },
          )

      def _assert_ip_blocked(self, channel):
          """Assert ``channel`` holds the 403 "blocked by IP blacklist" error."""
          self.assertEqual(
              channel.json_body,
              {
                  "errcode": "M_UNKNOWN",
                  "error": "IP address blocked by IP blacklist entry",
              },
          )
          self.assertEqual(channel.code, 403)

      # ------------------------------------------------------------------
      # Tests
      # ------------------------------------------------------------------

      def test_cache_returns_correct_type(self):
          """
          The same preview comes back on the first fetch, on a repeat request,
          and again after the in-memory cache entry has been removed.
          """
          self.lookups["matrix.org"] = [(IPv4Address, "10.1.2.3")]

          channel = self._request_preview("http://matrix.org")
          self._serve_response(self.end_content)
          self._assert_default_preview(channel)

          # Check the cache returns the correct response
          channel = self._request_preview("http://matrix.org")
          self._assert_default_preview(channel)

          # Clear the in-memory cache
          self.assertIn("http://matrix.org", self.preview_url._cache)
          self.preview_url._cache.pop("http://matrix.org")
          self.assertNotIn("http://matrix.org", self.preview_url._cache)

          # Check the database cache returns the correct response
          channel = self._request_preview("http://matrix.org")
          self._assert_default_preview(channel)

      def test_non_ascii_preview_httpequiv(self):
          """
          A windows-1251 charset declared via ``<meta http-equiv>`` is used to
          decode the title even though the Content-Type header says utf8.
          """
          self.lookups["matrix.org"] = [(IPv4Address, "10.1.2.3")]

          end_content = (
              b"<html><head>"
              b'<meta http-equiv="Content-Type" content="text/html; charset=windows-1251"/>'
              b'<meta property="og:title" content="\xe4\xea\xe0" />'
              b'<meta property="og:description" content="hi" />'
              b"</head></html>"
          )

          channel = self._request_preview("http://matrix.org")
          self._serve_response(end_content, b'text/html; charset="utf8"')

          self.assertEqual(channel.code, 200)
          self.assertEqual(channel.json_body["og:title"], "\u0434\u043a\u0430")

      def test_non_ascii_preview_content_type(self):
          """
          A windows-1251 charset declared in the Content-Type header is used to
          decode the title.
          """
          self.lookups["matrix.org"] = [(IPv4Address, "10.1.2.3")]

          end_content = (
              b"<html><head>"
              b'<meta property="og:title" content="\xe4\xea\xe0" />'
              b'<meta property="og:description" content="hi" />'
              b"</head></html>"
          )

          channel = self._request_preview("http://matrix.org")
          self._serve_response(end_content, b'text/html; charset="windows-1251"')

          self.assertEqual(channel.code, 200)
          self.assertEqual(channel.json_body["og:title"], "\u0434\u043a\u0430")

      def test_overlong_title(self):
          """
          A 2000-character ``<title>`` is dropped from the preview.
          """
          self.lookups["matrix.org"] = [(IPv4Address, "10.1.2.3")]

          end_content = (
              b"<html><head><title>"
              + b"x" * 2000
              + b"</title>"
              b'<meta property="og:description" content="hi" />'
              b"</head></html>"
          )

          channel = self._request_preview("http://matrix.org")
          self._serve_response(end_content, b'text/html; charset="windows-1251"')

          self.assertEqual(channel.code, 200)
          res = channel.json_body
          # We should only see the `og:description` field, as `title` is too
          # long and should be stripped out
          self.assertCountEqual(["og:description"], res.keys())

      def test_ipaddr(self):
          """
          A hostname that resolves to a non-blacklisted IP can be previewed.
          """
          self.lookups["example.com"] = [(IPv4Address, "10.1.2.3")]

          channel = self._request_preview("http://example.com")
          self._serve_response(self.end_content)

          self._assert_default_preview(channel)

      def test_blacklisted_ip_specific(self):
          """
          Blacklisted IP addresses, found via DNS, are not spidered.
          """
          self.lookups["example.com"] = [(IPv4Address, "192.168.1.1")]

          channel = self._request_preview("http://example.com")

          # No requests made.
          self.assertEqual(len(self.reactor.tcpClients), 0)
          self._assert_dns_failure(channel)

      def test_blacklisted_ip_range(self):
          """
          Blacklisted IP ranges, IPs found over DNS, are not spidered.
          """
          self.lookups["example.com"] = [(IPv4Address, "1.1.1.2")]

          channel = self._request_preview("http://example.com")

          self._assert_dns_failure(channel)

      def test_blacklisted_ip_specific_direct(self):
          """
          Blacklisted IP addresses, accessed directly, are not spidered.
          """
          channel = self._request_preview("http://192.168.1.1")

          # No requests made.
          self.assertEqual(len(self.reactor.tcpClients), 0)
          self._assert_ip_blocked(channel)

      def test_blacklisted_ip_range_direct(self):
          """
          Blacklisted IP ranges, accessed directly, are not spidered.
          """
          channel = self._request_preview("http://1.1.1.2")

          self._assert_ip_blocked(channel)

      def test_blacklisted_ip_range_whitelisted_ip(self):
          """
          Blacklisted but then subsequently whitelisted IP addresses can be
          spidered.
          """
          self.lookups["example.com"] = [(IPv4Address, "1.1.1.1")]

          channel = self._request_preview("http://example.com")
          self._serve_response(self.end_content)

          self._assert_default_preview(channel)

      def test_blacklisted_ip_with_external_ip(self):
          """
          If a hostname resolves a blacklisted IP, even if there's a
          non-blacklisted one, it will be rejected.
          """
          # Hardcode the URL resolving to the IP we want.
          self.lookups["example.com"] = [
              (IPv4Address, "1.1.1.2"),
              (IPv4Address, "10.1.2.3"),
          ]

          channel = self._request_preview("http://example.com")

          self._assert_dns_failure(channel)

      def test_blacklisted_ipv6_specific(self):
          """
          Blacklisted IPv6 addresses, found via DNS, are not spidered.
          """
          self.lookups["example.com"] = [
              (IPv6Address, "3fff:ffff:ffff:ffff:ffff:ffff:ffff:ffff")
          ]

          channel = self._request_preview("http://example.com")

          # No requests made.
          self.assertEqual(len(self.reactor.tcpClients), 0)
          self._assert_dns_failure(channel)

      def test_blacklisted_ipv6_range(self):
          """
          Blacklisted IPv6 ranges, IPs found over DNS, are not spidered.
          """
          self.lookups["example.com"] = [(IPv6Address, "2001:800::1")]

          channel = self._request_preview("http://example.com")

          self._assert_dns_failure(channel)

      def test_OPTIONS(self):
          """
          OPTIONS returns the OPTIONS.
          """
          channel = self._request_preview("http://example.com", method="OPTIONS")

          self.assertEqual(channel.code, 200)
          self.assertEqual(channel.json_body, {})

      def test_accept_language_config_option(self):
          """
          Accept-Language header is sent to the remote server
          """
          self.lookups["example.com"] = [(IPv4Address, "10.1.2.3")]

          # Build and make a request to the server
          channel = self._request_preview("http://example.com")

          # Play the remote server: connect to Synapse's tcp client, reply with
          # the default page and move the reactor along until we get a
          # response on our original channel.
          server = self._serve_response(self.end_content)

          self._assert_default_preview(channel)

          # Check that the server received the Accept-Language header as part
          # of the request from Synapse
          self.assertIn(
              (
                  b"Accept-Language: en-UK\r\n"
                  b"Accept-Language: en-US;q=0.9\r\n"
                  b"Accept-Language: fr;q=0.8\r\n"
                  b"Accept-Language: *;q=0.7"
              ),
              server.data,
          )

      def test_oembed_photo(self):
          """Test an oEmbed endpoint which returns a 'photo' type which redirects the preview to a new URL."""
          with self._patch_oembed_patterns():
              self.lookups["publish.twitter.com"] = [(IPv4Address, "10.1.2.3")]
              self.lookups["cdn.twitter.com"] = [(IPv4Address, "10.1.2.3")]

              result = {
                  "version": "1.0",
                  "type": "photo",
                  "url": "http://cdn.twitter.com/matrixdotorg",
              }
              oembed_content = json.dumps(result).encode("utf-8")

              end_content = (
                  b"<html><head>"
                  b"<title>Some Title</title>"
                  b'<meta property="og:description" content="hi" />'
                  b"</head></html>"
              )

              channel = self._request_preview(
                  "http://twitter.com/matrixdotorg/status/12345"
              )

              # First outbound connection: the oEmbed JSON.
              self._serve_response(
                  oembed_content, b'application/json; charset="utf8"'
              )
              # Second outbound connection: the page the oEmbed `url` points at.
              self._serve_response(
                  end_content, b'text/html; charset="utf8"', client_index=1
              )

              self.assertEqual(channel.code, 200)
              self.assertEqual(
                  channel.json_body,
                  {"og:title": "Some Title", "og:description": "hi"},
              )

      def test_oembed_rich(self):
          """Test an oEmbed endpoint which returns HTML content via the 'rich' type."""
          with self._patch_oembed_patterns():
              self.lookups["publish.twitter.com"] = [(IPv4Address, "10.1.2.3")]

              result = {
                  "version": "1.0",
                  "type": "rich",
                  "html": "<div>Content Preview</div>",
              }
              end_content = json.dumps(result).encode("utf-8")

              channel = self._request_preview(
                  "http://twitter.com/matrixdotorg/status/12345"
              )
              self._serve_response(end_content, b'application/json; charset="utf8"')

              self.assertEqual(channel.code, 200)
              self.assertEqual(
                  channel.json_body,
                  {"og:title": None, "og:description": "Content Preview"},
              )