test_url_preview.py 22 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668
  1. # -*- coding: utf-8 -*-
  2. # Copyright 2018 New Vector Ltd
  3. #
  4. # Licensed under the Apache License, Version 2.0 (the "License");
  5. # you may not use this file except in compliance with the License.
  6. # You may obtain a copy of the License at
  7. #
  8. # http://www.apache.org/licenses/LICENSE-2.0
  9. #
  10. # Unless required by applicable law or agreed to in writing, software
  11. # distributed under the License is distributed on an "AS IS" BASIS,
  12. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. # See the License for the specific language governing permissions and
  14. # limitations under the License.
  15. import json
  16. import os
  17. import re
  18. from mock import patch
  19. from twisted.internet._resolver import HostResolution
  20. from twisted.internet.address import IPv4Address, IPv6Address
  21. from twisted.internet.error import DNSLookupError
  22. from twisted.test.proto_helpers import AccumulatingProtocol
  23. from tests import unittest
  24. from tests.server import FakeTransport
  25. try:
  26. import lxml
  27. except ImportError:
  28. lxml = None
  29. class URLPreviewTests(unittest.HomeserverTestCase):
  30. if not lxml:
  31. skip = "url preview feature requires lxml"
  32. hijack_auth = True
  33. user_id = "@test:user"
  34. end_content = (
  35. b"<html><head>"
  36. b'<meta property="og:title" content="~matrix~" />'
  37. b'<meta property="og:description" content="hi" />'
  38. b"</head></html>"
  39. )
  40. def make_homeserver(self, reactor, clock):
  41. config = self.default_config()
  42. config["url_preview_enabled"] = True
  43. config["max_spider_size"] = 9999999
  44. config["url_preview_ip_range_blacklist"] = (
  45. "192.168.1.1",
  46. "1.0.0.0/8",
  47. "3fff:ffff:ffff:ffff:ffff:ffff:ffff:ffff",
  48. "2001:800::/21",
  49. )
  50. config["url_preview_ip_range_whitelist"] = ("1.1.1.1",)
  51. config["url_preview_url_blacklist"] = []
  52. config["url_preview_accept_language"] = [
  53. "en-UK",
  54. "en-US;q=0.9",
  55. "fr;q=0.8",
  56. "*;q=0.7",
  57. ]
  58. self.storage_path = self.mktemp()
  59. self.media_store_path = self.mktemp()
  60. os.mkdir(self.storage_path)
  61. os.mkdir(self.media_store_path)
  62. config["media_store_path"] = self.media_store_path
  63. provider_config = {
  64. "module": "synapse.rest.media.v1.storage_provider.FileStorageProviderBackend",
  65. "store_local": True,
  66. "store_synchronous": False,
  67. "store_remote": True,
  68. "config": {"directory": self.storage_path},
  69. }
  70. config["media_storage_providers"] = [provider_config]
  71. hs = self.setup_test_homeserver(config=config)
  72. return hs
  73. def prepare(self, reactor, clock, hs):
  74. self.media_repo = hs.get_media_repository_resource()
  75. self.preview_url = self.media_repo.children[b"preview_url"]
  76. self.lookups = {}
  77. class Resolver:
  78. def resolveHostName(
  79. _self,
  80. resolutionReceiver,
  81. hostName,
  82. portNumber=0,
  83. addressTypes=None,
  84. transportSemantics="TCP",
  85. ):
  86. resolution = HostResolution(hostName)
  87. resolutionReceiver.resolutionBegan(resolution)
  88. if hostName not in self.lookups:
  89. raise DNSLookupError("OH NO")
  90. for i in self.lookups[hostName]:
  91. resolutionReceiver.addressResolved(i[0]("TCP", i[1], portNumber))
  92. resolutionReceiver.resolutionComplete()
  93. return resolutionReceiver
  94. self.reactor.nameResolver = Resolver()
  95. def create_test_resource(self):
  96. return self.hs.get_media_repository_resource()
  97. def test_cache_returns_correct_type(self):
  98. self.lookups["matrix.org"] = [(IPv4Address, "10.1.2.3")]
  99. channel = self.make_request(
  100. "GET",
  101. "preview_url?url=http://matrix.org",
  102. shorthand=False,
  103. await_result=False,
  104. )
  105. self.pump()
  106. client = self.reactor.tcpClients[0][2].buildProtocol(None)
  107. server = AccumulatingProtocol()
  108. server.makeConnection(FakeTransport(client, self.reactor))
  109. client.makeConnection(FakeTransport(server, self.reactor))
  110. client.dataReceived(
  111. b"HTTP/1.0 200 OK\r\nContent-Length: %d\r\nContent-Type: text/html\r\n\r\n"
  112. % (len(self.end_content),)
  113. + self.end_content
  114. )
  115. self.pump()
  116. self.assertEqual(channel.code, 200)
  117. self.assertEqual(
  118. channel.json_body, {"og:title": "~matrix~", "og:description": "hi"}
  119. )
  120. # Check the cache returns the correct response
  121. channel = self.make_request(
  122. "GET", "preview_url?url=http://matrix.org", shorthand=False
  123. )
  124. # Check the cache response has the same content
  125. self.assertEqual(channel.code, 200)
  126. self.assertEqual(
  127. channel.json_body, {"og:title": "~matrix~", "og:description": "hi"}
  128. )
  129. # Clear the in-memory cache
  130. self.assertIn("http://matrix.org", self.preview_url._cache)
  131. self.preview_url._cache.pop("http://matrix.org")
  132. self.assertNotIn("http://matrix.org", self.preview_url._cache)
  133. # Check the database cache returns the correct response
  134. channel = self.make_request(
  135. "GET", "preview_url?url=http://matrix.org", shorthand=False
  136. )
  137. # Check the cache response has the same content
  138. self.assertEqual(channel.code, 200)
  139. self.assertEqual(
  140. channel.json_body, {"og:title": "~matrix~", "og:description": "hi"}
  141. )
  142. def test_non_ascii_preview_httpequiv(self):
  143. self.lookups["matrix.org"] = [(IPv4Address, "10.1.2.3")]
  144. end_content = (
  145. b"<html><head>"
  146. b'<meta http-equiv="Content-Type" content="text/html; charset=windows-1251"/>'
  147. b'<meta property="og:title" content="\xe4\xea\xe0" />'
  148. b'<meta property="og:description" content="hi" />'
  149. b"</head></html>"
  150. )
  151. channel = self.make_request(
  152. "GET",
  153. "preview_url?url=http://matrix.org",
  154. shorthand=False,
  155. await_result=False,
  156. )
  157. self.pump()
  158. client = self.reactor.tcpClients[0][2].buildProtocol(None)
  159. server = AccumulatingProtocol()
  160. server.makeConnection(FakeTransport(client, self.reactor))
  161. client.makeConnection(FakeTransport(server, self.reactor))
  162. client.dataReceived(
  163. (
  164. b"HTTP/1.0 200 OK\r\nContent-Length: %d\r\n"
  165. b'Content-Type: text/html; charset="utf8"\r\n\r\n'
  166. )
  167. % (len(end_content),)
  168. + end_content
  169. )
  170. self.pump()
  171. self.assertEqual(channel.code, 200)
  172. self.assertEqual(channel.json_body["og:title"], "\u0434\u043a\u0430")
  173. def test_non_ascii_preview_content_type(self):
  174. self.lookups["matrix.org"] = [(IPv4Address, "10.1.2.3")]
  175. end_content = (
  176. b"<html><head>"
  177. b'<meta property="og:title" content="\xe4\xea\xe0" />'
  178. b'<meta property="og:description" content="hi" />'
  179. b"</head></html>"
  180. )
  181. channel = self.make_request(
  182. "GET",
  183. "preview_url?url=http://matrix.org",
  184. shorthand=False,
  185. await_result=False,
  186. )
  187. self.pump()
  188. client = self.reactor.tcpClients[0][2].buildProtocol(None)
  189. server = AccumulatingProtocol()
  190. server.makeConnection(FakeTransport(client, self.reactor))
  191. client.makeConnection(FakeTransport(server, self.reactor))
  192. client.dataReceived(
  193. (
  194. b"HTTP/1.0 200 OK\r\nContent-Length: %d\r\n"
  195. b'Content-Type: text/html; charset="windows-1251"\r\n\r\n'
  196. )
  197. % (len(end_content),)
  198. + end_content
  199. )
  200. self.pump()
  201. self.assertEqual(channel.code, 200)
  202. self.assertEqual(channel.json_body["og:title"], "\u0434\u043a\u0430")
  203. def test_overlong_title(self):
  204. self.lookups["matrix.org"] = [(IPv4Address, "10.1.2.3")]
  205. end_content = (
  206. b"<html><head>"
  207. b"<title>" + b"x" * 2000 + b"</title>"
  208. b'<meta property="og:description" content="hi" />'
  209. b"</head></html>"
  210. )
  211. channel = self.make_request(
  212. "GET",
  213. "preview_url?url=http://matrix.org",
  214. shorthand=False,
  215. await_result=False,
  216. )
  217. self.pump()
  218. client = self.reactor.tcpClients[0][2].buildProtocol(None)
  219. server = AccumulatingProtocol()
  220. server.makeConnection(FakeTransport(client, self.reactor))
  221. client.makeConnection(FakeTransport(server, self.reactor))
  222. client.dataReceived(
  223. (
  224. b"HTTP/1.0 200 OK\r\nContent-Length: %d\r\n"
  225. b'Content-Type: text/html; charset="windows-1251"\r\n\r\n'
  226. )
  227. % (len(end_content),)
  228. + end_content
  229. )
  230. self.pump()
  231. self.assertEqual(channel.code, 200)
  232. res = channel.json_body
  233. # We should only see the `og:description` field, as `title` is too long and should be stripped out
  234. self.assertCountEqual(["og:description"], res.keys())
  235. def test_ipaddr(self):
  236. """
  237. IP addresses can be previewed directly.
  238. """
  239. self.lookups["example.com"] = [(IPv4Address, "10.1.2.3")]
  240. channel = self.make_request(
  241. "GET",
  242. "preview_url?url=http://example.com",
  243. shorthand=False,
  244. await_result=False,
  245. )
  246. self.pump()
  247. client = self.reactor.tcpClients[0][2].buildProtocol(None)
  248. server = AccumulatingProtocol()
  249. server.makeConnection(FakeTransport(client, self.reactor))
  250. client.makeConnection(FakeTransport(server, self.reactor))
  251. client.dataReceived(
  252. b"HTTP/1.0 200 OK\r\nContent-Length: %d\r\nContent-Type: text/html\r\n\r\n"
  253. % (len(self.end_content),)
  254. + self.end_content
  255. )
  256. self.pump()
  257. self.assertEqual(channel.code, 200)
  258. self.assertEqual(
  259. channel.json_body, {"og:title": "~matrix~", "og:description": "hi"}
  260. )
  261. def test_blacklisted_ip_specific(self):
  262. """
  263. Blacklisted IP addresses, found via DNS, are not spidered.
  264. """
  265. self.lookups["example.com"] = [(IPv4Address, "192.168.1.1")]
  266. channel = self.make_request(
  267. "GET", "preview_url?url=http://example.com", shorthand=False
  268. )
  269. # No requests made.
  270. self.assertEqual(len(self.reactor.tcpClients), 0)
  271. self.assertEqual(channel.code, 502)
  272. self.assertEqual(
  273. channel.json_body,
  274. {
  275. "errcode": "M_UNKNOWN",
  276. "error": "DNS resolution failure during URL preview generation",
  277. },
  278. )
  279. def test_blacklisted_ip_range(self):
  280. """
  281. Blacklisted IP ranges, IPs found over DNS, are not spidered.
  282. """
  283. self.lookups["example.com"] = [(IPv4Address, "1.1.1.2")]
  284. channel = self.make_request(
  285. "GET", "preview_url?url=http://example.com", shorthand=False
  286. )
  287. self.assertEqual(channel.code, 502)
  288. self.assertEqual(
  289. channel.json_body,
  290. {
  291. "errcode": "M_UNKNOWN",
  292. "error": "DNS resolution failure during URL preview generation",
  293. },
  294. )
  295. def test_blacklisted_ip_specific_direct(self):
  296. """
  297. Blacklisted IP addresses, accessed directly, are not spidered.
  298. """
  299. channel = self.make_request(
  300. "GET", "preview_url?url=http://192.168.1.1", shorthand=False
  301. )
  302. # No requests made.
  303. self.assertEqual(len(self.reactor.tcpClients), 0)
  304. self.assertEqual(
  305. channel.json_body,
  306. {
  307. "errcode": "M_UNKNOWN",
  308. "error": "IP address blocked by IP blacklist entry",
  309. },
  310. )
  311. self.assertEqual(channel.code, 403)
  312. def test_blacklisted_ip_range_direct(self):
  313. """
  314. Blacklisted IP ranges, accessed directly, are not spidered.
  315. """
  316. channel = self.make_request(
  317. "GET", "preview_url?url=http://1.1.1.2", shorthand=False
  318. )
  319. self.assertEqual(channel.code, 403)
  320. self.assertEqual(
  321. channel.json_body,
  322. {
  323. "errcode": "M_UNKNOWN",
  324. "error": "IP address blocked by IP blacklist entry",
  325. },
  326. )
  327. def test_blacklisted_ip_range_whitelisted_ip(self):
  328. """
  329. Blacklisted but then subsequently whitelisted IP addresses can be
  330. spidered.
  331. """
  332. self.lookups["example.com"] = [(IPv4Address, "1.1.1.1")]
  333. channel = self.make_request(
  334. "GET",
  335. "preview_url?url=http://example.com",
  336. shorthand=False,
  337. await_result=False,
  338. )
  339. self.pump()
  340. client = self.reactor.tcpClients[0][2].buildProtocol(None)
  341. server = AccumulatingProtocol()
  342. server.makeConnection(FakeTransport(client, self.reactor))
  343. client.makeConnection(FakeTransport(server, self.reactor))
  344. client.dataReceived(
  345. b"HTTP/1.0 200 OK\r\nContent-Length: %d\r\nContent-Type: text/html\r\n\r\n"
  346. % (len(self.end_content),)
  347. + self.end_content
  348. )
  349. self.pump()
  350. self.assertEqual(channel.code, 200)
  351. self.assertEqual(
  352. channel.json_body, {"og:title": "~matrix~", "og:description": "hi"}
  353. )
  354. def test_blacklisted_ip_with_external_ip(self):
  355. """
  356. If a hostname resolves a blacklisted IP, even if there's a
  357. non-blacklisted one, it will be rejected.
  358. """
  359. # Hardcode the URL resolving to the IP we want.
  360. self.lookups["example.com"] = [
  361. (IPv4Address, "1.1.1.2"),
  362. (IPv4Address, "10.1.2.3"),
  363. ]
  364. channel = self.make_request(
  365. "GET", "preview_url?url=http://example.com", shorthand=False
  366. )
  367. self.assertEqual(channel.code, 502)
  368. self.assertEqual(
  369. channel.json_body,
  370. {
  371. "errcode": "M_UNKNOWN",
  372. "error": "DNS resolution failure during URL preview generation",
  373. },
  374. )
  375. def test_blacklisted_ipv6_specific(self):
  376. """
  377. Blacklisted IP addresses, found via DNS, are not spidered.
  378. """
  379. self.lookups["example.com"] = [
  380. (IPv6Address, "3fff:ffff:ffff:ffff:ffff:ffff:ffff:ffff")
  381. ]
  382. channel = self.make_request(
  383. "GET", "preview_url?url=http://example.com", shorthand=False
  384. )
  385. # No requests made.
  386. self.assertEqual(len(self.reactor.tcpClients), 0)
  387. self.assertEqual(channel.code, 502)
  388. self.assertEqual(
  389. channel.json_body,
  390. {
  391. "errcode": "M_UNKNOWN",
  392. "error": "DNS resolution failure during URL preview generation",
  393. },
  394. )
  395. def test_blacklisted_ipv6_range(self):
  396. """
  397. Blacklisted IP ranges, IPs found over DNS, are not spidered.
  398. """
  399. self.lookups["example.com"] = [(IPv6Address, "2001:800::1")]
  400. channel = self.make_request(
  401. "GET", "preview_url?url=http://example.com", shorthand=False
  402. )
  403. self.assertEqual(channel.code, 502)
  404. self.assertEqual(
  405. channel.json_body,
  406. {
  407. "errcode": "M_UNKNOWN",
  408. "error": "DNS resolution failure during URL preview generation",
  409. },
  410. )
  411. def test_OPTIONS(self):
  412. """
  413. OPTIONS returns the OPTIONS.
  414. """
  415. channel = self.make_request(
  416. "OPTIONS", "preview_url?url=http://example.com", shorthand=False
  417. )
  418. self.assertEqual(channel.code, 200)
  419. self.assertEqual(channel.json_body, {})
  420. def test_accept_language_config_option(self):
  421. """
  422. Accept-Language header is sent to the remote server
  423. """
  424. self.lookups["example.com"] = [(IPv4Address, "10.1.2.3")]
  425. # Build and make a request to the server
  426. channel = self.make_request(
  427. "GET",
  428. "preview_url?url=http://example.com",
  429. shorthand=False,
  430. await_result=False,
  431. )
  432. self.pump()
  433. # Extract Synapse's tcp client
  434. client = self.reactor.tcpClients[0][2].buildProtocol(None)
  435. # Build a fake remote server to reply with
  436. server = AccumulatingProtocol()
  437. # Connect the two together
  438. server.makeConnection(FakeTransport(client, self.reactor))
  439. client.makeConnection(FakeTransport(server, self.reactor))
  440. # Tell Synapse that it has received some data from the remote server
  441. client.dataReceived(
  442. b"HTTP/1.0 200 OK\r\nContent-Length: %d\r\nContent-Type: text/html\r\n\r\n"
  443. % (len(self.end_content),)
  444. + self.end_content
  445. )
  446. # Move the reactor along until we get a response on our original channel
  447. self.pump()
  448. self.assertEqual(channel.code, 200)
  449. self.assertEqual(
  450. channel.json_body, {"og:title": "~matrix~", "og:description": "hi"}
  451. )
  452. # Check that the server received the Accept-Language header as part
  453. # of the request from Synapse
  454. self.assertIn(
  455. (
  456. b"Accept-Language: en-UK\r\n"
  457. b"Accept-Language: en-US;q=0.9\r\n"
  458. b"Accept-Language: fr;q=0.8\r\n"
  459. b"Accept-Language: *;q=0.7"
  460. ),
  461. server.data,
  462. )
  463. def test_oembed_photo(self):
  464. """Test an oEmbed endpoint which returns a 'photo' type which redirects the preview to a new URL."""
  465. # Route the HTTP version to an HTTP endpoint so that the tests work.
  466. with patch.dict(
  467. "synapse.rest.media.v1.preview_url_resource._oembed_patterns",
  468. {
  469. re.compile(
  470. r"http://twitter\.com/.+/status/.+"
  471. ): "http://publish.twitter.com/oembed",
  472. },
  473. clear=True,
  474. ):
  475. self.lookups["publish.twitter.com"] = [(IPv4Address, "10.1.2.3")]
  476. self.lookups["cdn.twitter.com"] = [(IPv4Address, "10.1.2.3")]
  477. result = {
  478. "version": "1.0",
  479. "type": "photo",
  480. "url": "http://cdn.twitter.com/matrixdotorg",
  481. }
  482. oembed_content = json.dumps(result).encode("utf-8")
  483. end_content = (
  484. b"<html><head>"
  485. b"<title>Some Title</title>"
  486. b'<meta property="og:description" content="hi" />'
  487. b"</head></html>"
  488. )
  489. channel = self.make_request(
  490. "GET",
  491. "preview_url?url=http://twitter.com/matrixdotorg/status/12345",
  492. shorthand=False,
  493. await_result=False,
  494. )
  495. self.pump()
  496. client = self.reactor.tcpClients[0][2].buildProtocol(None)
  497. server = AccumulatingProtocol()
  498. server.makeConnection(FakeTransport(client, self.reactor))
  499. client.makeConnection(FakeTransport(server, self.reactor))
  500. client.dataReceived(
  501. (
  502. b"HTTP/1.0 200 OK\r\nContent-Length: %d\r\n"
  503. b'Content-Type: application/json; charset="utf8"\r\n\r\n'
  504. )
  505. % (len(oembed_content),)
  506. + oembed_content
  507. )
  508. self.pump()
  509. client = self.reactor.tcpClients[1][2].buildProtocol(None)
  510. server = AccumulatingProtocol()
  511. server.makeConnection(FakeTransport(client, self.reactor))
  512. client.makeConnection(FakeTransport(server, self.reactor))
  513. client.dataReceived(
  514. (
  515. b"HTTP/1.0 200 OK\r\nContent-Length: %d\r\n"
  516. b'Content-Type: text/html; charset="utf8"\r\n\r\n'
  517. )
  518. % (len(end_content),)
  519. + end_content
  520. )
  521. self.pump()
  522. self.assertEqual(channel.code, 200)
  523. self.assertEqual(
  524. channel.json_body, {"og:title": "Some Title", "og:description": "hi"}
  525. )
  526. def test_oembed_rich(self):
  527. """Test an oEmbed endpoint which returns HTML content via the 'rich' type."""
  528. # Route the HTTP version to an HTTP endpoint so that the tests work.
  529. with patch.dict(
  530. "synapse.rest.media.v1.preview_url_resource._oembed_patterns",
  531. {
  532. re.compile(
  533. r"http://twitter\.com/.+/status/.+"
  534. ): "http://publish.twitter.com/oembed",
  535. },
  536. clear=True,
  537. ):
  538. self.lookups["publish.twitter.com"] = [(IPv4Address, "10.1.2.3")]
  539. result = {
  540. "version": "1.0",
  541. "type": "rich",
  542. "html": "<div>Content Preview</div>",
  543. }
  544. end_content = json.dumps(result).encode("utf-8")
  545. channel = self.make_request(
  546. "GET",
  547. "preview_url?url=http://twitter.com/matrixdotorg/status/12345",
  548. shorthand=False,
  549. await_result=False,
  550. )
  551. self.pump()
  552. client = self.reactor.tcpClients[0][2].buildProtocol(None)
  553. server = AccumulatingProtocol()
  554. server.makeConnection(FakeTransport(client, self.reactor))
  555. client.makeConnection(FakeTransport(server, self.reactor))
  556. client.dataReceived(
  557. (
  558. b"HTTP/1.0 200 OK\r\nContent-Length: %d\r\n"
  559. b'Content-Type: application/json; charset="utf8"\r\n\r\n'
  560. )
  561. % (len(end_content),)
  562. + end_content
  563. )
  564. self.pump()
  565. self.assertEqual(channel.code, 200)
  566. self.assertEqual(
  567. channel.json_body,
  568. {"og:title": None, "og:description": "Content Preview"},
  569. )