test_html_preview.py 20 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542
# Copyright 2014-2016 OpenMarket Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
  14. from synapse.rest.media.v1.preview_html import (
  15. _get_html_media_encodings,
  16. decode_body,
  17. parse_html_to_open_graph,
  18. summarize_paragraphs,
  19. )
  20. from tests import unittest
  21. try:
  22. import lxml
  23. except ImportError:
  24. lxml = None
  25. class SummarizeTestCase(unittest.TestCase):
  26. if not lxml:
  27. skip = "url preview feature requires lxml"
  28. def test_long_summarize(self) -> None:
  29. example_paras = [
  30. """Tromsø (Norwegian pronunciation: [ˈtrʊmsœ] ( listen); Northern Sami:
  31. Romsa; Finnish: Tromssa[2] Kven: Tromssa) is a city and municipality in
  32. Troms county, Norway. The administrative centre of the municipality is
  33. the city of Tromsø. Outside of Norway, Tromso and Tromsö are
  34. alternative spellings of the city.Tromsø is considered the northernmost
  35. city in the world with a population above 50,000. The most populous town
  36. north of it is Alta, Norway, with a population of 14,272 (2013).""",
  37. """Tromsø lies in Northern Norway. The municipality has a population of
  38. (2015) 72,066, but with an annual influx of students it has over 75,000
  39. most of the year. It is the largest urban area in Northern Norway and the
  40. third largest north of the Arctic Circle (following Murmansk and Norilsk).
  41. Most of Tromsø, including the city centre, is located on the island of
  42. Tromsøya, 350 kilometres (217 mi) north of the Arctic Circle. In 2012,
  43. Tromsøya had a population of 36,088. Substantial parts of the urban area
  44. are also situated on the mainland to the east, and on parts of Kvaløya—a
  45. large island to the west. Tromsøya is connected to the mainland by the Tromsø
  46. Bridge and the Tromsøysund Tunnel, and to the island of Kvaløya by the
  47. Sandnessund Bridge. Tromsø Airport connects the city to many destinations
  48. in Europe. The city is warmer than most other places located on the same
  49. latitude, due to the warming effect of the Gulf Stream.""",
  50. """The city centre of Tromsø contains the highest number of old wooden
  51. houses in Northern Norway, the oldest house dating from 1789. The Arctic
  52. Cathedral, a modern church from 1965, is probably the most famous landmark
  53. in Tromsø. The city is a cultural centre for its region, with several
  54. festivals taking place in the summer. Some of Norway's best-known
  55. musicians, Torbjørn Brundtland and Svein Berge of the electronica duo
  56. Röyksopp and Lene Marlin grew up and started their careers in Tromsø.
  57. Noted electronic musician Geir Jenssen also hails from Tromsø.""",
  58. ]
  59. desc = summarize_paragraphs(example_paras, min_size=200, max_size=500)
  60. self.assertEqual(
  61. desc,
  62. "Tromsø (Norwegian pronunciation: [ˈtrʊmsœ] ( listen); Northern Sami:"
  63. " Romsa; Finnish: Tromssa[2] Kven: Tromssa) is a city and municipality in"
  64. " Troms county, Norway. The administrative centre of the municipality is"
  65. " the city of Tromsø. Outside of Norway, Tromso and Tromsö are"
  66. " alternative spellings of the city.Tromsø is considered the northernmost"
  67. " city in the world with a population above 50,000. The most populous town"
  68. " north of it is Alta, Norway, with a population of 14,272 (2013).",
  69. )
  70. desc = summarize_paragraphs(example_paras[1:], min_size=200, max_size=500)
  71. self.assertEqual(
  72. desc,
  73. "Tromsø lies in Northern Norway. The municipality has a population of"
  74. " (2015) 72,066, but with an annual influx of students it has over 75,000"
  75. " most of the year. It is the largest urban area in Northern Norway and the"
  76. " third largest north of the Arctic Circle (following Murmansk and Norilsk)."
  77. " Most of Tromsø, including the city centre, is located on the island of"
  78. " Tromsøya, 350 kilometres (217 mi) north of the Arctic Circle. In 2012,"
  79. " Tromsøya had a population of 36,088. Substantial parts of the urban…",
  80. )
  81. def test_short_summarize(self) -> None:
  82. example_paras = [
  83. "Tromsø (Norwegian pronunciation: [ˈtrʊmsœ] ( listen); Northern Sami:"
  84. " Romsa; Finnish: Tromssa[2] Kven: Tromssa) is a city and municipality in"
  85. " Troms county, Norway.",
  86. "Tromsø lies in Northern Norway. The municipality has a population of"
  87. " (2015) 72,066, but with an annual influx of students it has over 75,000"
  88. " most of the year.",
  89. "The city centre of Tromsø contains the highest number of old wooden"
  90. " houses in Northern Norway, the oldest house dating from 1789. The Arctic"
  91. " Cathedral, a modern church from 1965, is probably the most famous landmark"
  92. " in Tromsø.",
  93. ]
  94. desc = summarize_paragraphs(example_paras, min_size=200, max_size=500)
  95. self.assertEqual(
  96. desc,
  97. "Tromsø (Norwegian pronunciation: [ˈtrʊmsœ] ( listen); Northern Sami:"
  98. " Romsa; Finnish: Tromssa[2] Kven: Tromssa) is a city and municipality in"
  99. " Troms county, Norway.\n"
  100. "\n"
  101. "Tromsø lies in Northern Norway. The municipality has a population of"
  102. " (2015) 72,066, but with an annual influx of students it has over 75,000"
  103. " most of the year.",
  104. )
  105. def test_small_then_large_summarize(self) -> None:
  106. example_paras = [
  107. "Tromsø (Norwegian pronunciation: [ˈtrʊmsœ] ( listen); Northern Sami:"
  108. " Romsa; Finnish: Tromssa[2] Kven: Tromssa) is a city and municipality in"
  109. " Troms county, Norway.",
  110. "Tromsø lies in Northern Norway. The municipality has a population of"
  111. " (2015) 72,066, but with an annual influx of students it has over 75,000"
  112. " most of the year."
  113. " The city centre of Tromsø contains the highest number of old wooden"
  114. " houses in Northern Norway, the oldest house dating from 1789. The Arctic"
  115. " Cathedral, a modern church from 1965, is probably the most famous landmark"
  116. " in Tromsø.",
  117. ]
  118. desc = summarize_paragraphs(example_paras, min_size=200, max_size=500)
  119. self.assertEqual(
  120. desc,
  121. "Tromsø (Norwegian pronunciation: [ˈtrʊmsœ] ( listen); Northern Sami:"
  122. " Romsa; Finnish: Tromssa[2] Kven: Tromssa) is a city and municipality in"
  123. " Troms county, Norway.\n"
  124. "\n"
  125. "Tromsø lies in Northern Norway. The municipality has a population of"
  126. " (2015) 72,066, but with an annual influx of students it has over 75,000"
  127. " most of the year. The city centre of Tromsø contains the highest number"
  128. " of old wooden houses in Northern Norway, the oldest house dating from"
  129. " 1789. The Arctic Cathedral, a modern church from…",
  130. )
  131. class OpenGraphFromHtmlTestCase(unittest.TestCase):
  132. if not lxml:
  133. skip = "url preview feature requires lxml"
  134. def test_simple(self) -> None:
  135. html = b"""
  136. <html>
  137. <head><title>Foo</title></head>
  138. <body>
  139. Some text.
  140. </body>
  141. </html>
  142. """
  143. tree = decode_body(html, "http://example.com/test.html")
  144. og = parse_html_to_open_graph(tree)
  145. self.assertEqual(og, {"og:title": "Foo", "og:description": "Some text."})
  146. def test_comment(self) -> None:
  147. html = b"""
  148. <html>
  149. <head><title>Foo</title></head>
  150. <body>
  151. <!-- HTML comment -->
  152. Some text.
  153. </body>
  154. </html>
  155. """
  156. tree = decode_body(html, "http://example.com/test.html")
  157. og = parse_html_to_open_graph(tree)
  158. self.assertEqual(og, {"og:title": "Foo", "og:description": "Some text."})
  159. def test_comment2(self) -> None:
  160. html = b"""
  161. <html>
  162. <head><title>Foo</title></head>
  163. <body>
  164. Some text.
  165. <!-- HTML comment -->
  166. Some more text.
  167. <p>Text</p>
  168. More text
  169. </body>
  170. </html>
  171. """
  172. tree = decode_body(html, "http://example.com/test.html")
  173. og = parse_html_to_open_graph(tree)
  174. self.assertEqual(
  175. og,
  176. {
  177. "og:title": "Foo",
  178. "og:description": "Some text.\n\nSome more text.\n\nText\n\nMore text",
  179. },
  180. )
  181. def test_script(self) -> None:
  182. html = b"""
  183. <html>
  184. <head><title>Foo</title></head>
  185. <body>
  186. <script> (function() {})() </script>
  187. Some text.
  188. </body>
  189. </html>
  190. """
  191. tree = decode_body(html, "http://example.com/test.html")
  192. og = parse_html_to_open_graph(tree)
  193. self.assertEqual(og, {"og:title": "Foo", "og:description": "Some text."})
  194. def test_missing_title(self) -> None:
  195. html = b"""
  196. <html>
  197. <body>
  198. Some text.
  199. </body>
  200. </html>
  201. """
  202. tree = decode_body(html, "http://example.com/test.html")
  203. og = parse_html_to_open_graph(tree)
  204. self.assertEqual(og, {"og:title": None, "og:description": "Some text."})
  205. # Another variant is a title with no content.
  206. html = b"""
  207. <html>
  208. <head><title></title></head>
  209. <body>
  210. <h1>Title</h1>
  211. </body>
  212. </html>
  213. """
  214. tree = decode_body(html, "http://example.com/test.html")
  215. og = parse_html_to_open_graph(tree)
  216. self.assertEqual(og, {"og:title": "Title", "og:description": "Title"})
  217. def test_h1_as_title(self) -> None:
  218. html = b"""
  219. <html>
  220. <meta property="og:description" content="Some text."/>
  221. <body>
  222. <h1>Title</h1>
  223. </body>
  224. </html>
  225. """
  226. tree = decode_body(html, "http://example.com/test.html")
  227. og = parse_html_to_open_graph(tree)
  228. self.assertEqual(og, {"og:title": "Title", "og:description": "Some text."})
  229. def test_empty_description(self) -> None:
  230. """Description tags with empty content should be ignored."""
  231. html = b"""
  232. <html>
  233. <meta property="og:description" content=""/>
  234. <meta property="og:description"/>
  235. <meta name="description" content=""/>
  236. <meta name="description"/>
  237. <meta name="description" content="Finally!"/>
  238. <body>
  239. <h1>Title</h1>
  240. </body>
  241. </html>
  242. """
  243. tree = decode_body(html, "http://example.com/test.html")
  244. og = parse_html_to_open_graph(tree)
  245. self.assertEqual(og, {"og:title": "Title", "og:description": "Finally!"})
  246. def test_missing_title_and_broken_h1(self) -> None:
  247. html = b"""
  248. <html>
  249. <body>
  250. <h1><a href="foo"/></h1>
  251. Some text.
  252. </body>
  253. </html>
  254. """
  255. tree = decode_body(html, "http://example.com/test.html")
  256. og = parse_html_to_open_graph(tree)
  257. self.assertEqual(og, {"og:title": None, "og:description": "Some text."})
  258. def test_empty(self) -> None:
  259. """Test a body with no data in it."""
  260. html = b""
  261. tree = decode_body(html, "http://example.com/test.html")
  262. self.assertIsNone(tree)
  263. def test_no_tree(self) -> None:
  264. """A valid body with no tree in it."""
  265. html = b"\x00"
  266. tree = decode_body(html, "http://example.com/test.html")
  267. self.assertIsNone(tree)
  268. def test_xml(self) -> None:
  269. """Test decoding XML and ensure it works properly."""
  270. # Note that the strip() call is important to ensure the xml tag starts
  271. # at the initial byte.
  272. html = b"""
  273. <?xml version="1.0" encoding="UTF-8"?>
  274. <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
  275. <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
  276. <head><title>Foo</title></head><body>Some text.</body></html>
  277. """.strip()
  278. tree = decode_body(html, "http://example.com/test.html")
  279. og = parse_html_to_open_graph(tree)
  280. self.assertEqual(og, {"og:title": "Foo", "og:description": "Some text."})
  281. def test_invalid_encoding(self) -> None:
  282. """An invalid character encoding should be ignored and treated as UTF-8, if possible."""
  283. html = b"""
  284. <html>
  285. <head><title>Foo</title></head>
  286. <body>
  287. Some text.
  288. </body>
  289. </html>
  290. """
  291. tree = decode_body(html, "http://example.com/test.html", "invalid-encoding")
  292. og = parse_html_to_open_graph(tree)
  293. self.assertEqual(og, {"og:title": "Foo", "og:description": "Some text."})
  294. def test_invalid_encoding2(self) -> None:
  295. """A body which doesn't match the sent character encoding."""
  296. # Note that this contains an invalid UTF-8 sequence in the title.
  297. html = b"""
  298. <html>
  299. <head><title>\xff\xff Foo</title></head>
  300. <body>
  301. Some text.
  302. </body>
  303. </html>
  304. """
  305. tree = decode_body(html, "http://example.com/test.html")
  306. og = parse_html_to_open_graph(tree)
  307. self.assertEqual(og, {"og:title": "ÿÿ Foo", "og:description": "Some text."})
  308. def test_windows_1252(self) -> None:
  309. """A body which uses cp1252, but doesn't declare that."""
  310. html = b"""
  311. <html>
  312. <head><title>\xf3</title></head>
  313. <body>
  314. Some text.
  315. </body>
  316. </html>
  317. """
  318. tree = decode_body(html, "http://example.com/test.html")
  319. og = parse_html_to_open_graph(tree)
  320. self.assertEqual(og, {"og:title": "ó", "og:description": "Some text."})
  321. def test_twitter_tag(self) -> None:
  322. """Twitter card tags should be used if nothing else is available."""
  323. html = b"""
  324. <html>
  325. <meta name="twitter:card" content="summary">
  326. <meta name="twitter:description" content="Description">
  327. <meta name="twitter:site" content="@matrixdotorg">
  328. </html>
  329. """
  330. tree = decode_body(html, "http://example.com/test.html")
  331. og = parse_html_to_open_graph(tree)
  332. self.assertEqual(
  333. og,
  334. {
  335. "og:title": None,
  336. "og:description": "Description",
  337. "og:site_name": "@matrixdotorg",
  338. },
  339. )
  340. # But they shouldn't override Open Graph values.
  341. html = b"""
  342. <html>
  343. <meta name="twitter:card" content="summary">
  344. <meta name="twitter:description" content="Description">
  345. <meta property="og:description" content="Real Description">
  346. <meta name="twitter:site" content="@matrixdotorg">
  347. <meta property="og:site_name" content="matrix.org">
  348. </html>
  349. """
  350. tree = decode_body(html, "http://example.com/test.html")
  351. og = parse_html_to_open_graph(tree)
  352. self.assertEqual(
  353. og,
  354. {
  355. "og:title": None,
  356. "og:description": "Real Description",
  357. "og:site_name": "matrix.org",
  358. },
  359. )
  360. def test_nested_nodes(self) -> None:
  361. """A body with some nested nodes. Tests that we iterate over children
  362. in the right order (and don't reverse the order of the text)."""
  363. html = b"""
  364. <a href="somewhere">Welcome <b>the bold <u>and underlined text <svg>
  365. with a cheeky SVG</svg></u> and <strong>some</strong> tail text</b></a>
  366. """
  367. tree = decode_body(html, "http://example.com/test.html")
  368. og = parse_html_to_open_graph(tree)
  369. self.assertEqual(
  370. og,
  371. {
  372. "og:title": None,
  373. "og:description": "Welcome\n\nthe bold\n\nand underlined text\n\nand\n\nsome\n\ntail text",
  374. },
  375. )
  376. class MediaEncodingTestCase(unittest.TestCase):
  377. def test_meta_charset(self) -> None:
  378. """A character encoding is found via the meta tag."""
  379. encodings = _get_html_media_encodings(
  380. b"""
  381. <html>
  382. <head><meta charset="ascii">
  383. </head>
  384. </html>
  385. """,
  386. "text/html",
  387. )
  388. self.assertEqual(list(encodings), ["ascii", "utf-8", "cp1252"])
  389. # A less well-formed version.
  390. encodings = _get_html_media_encodings(
  391. b"""
  392. <html>
  393. <head>< meta charset = ascii>
  394. </head>
  395. </html>
  396. """,
  397. "text/html",
  398. )
  399. self.assertEqual(list(encodings), ["ascii", "utf-8", "cp1252"])
  400. def test_meta_charset_underscores(self) -> None:
  401. """A character encoding contains underscore."""
  402. encodings = _get_html_media_encodings(
  403. b"""
  404. <html>
  405. <head><meta charset="Shift_JIS">
  406. </head>
  407. </html>
  408. """,
  409. "text/html",
  410. )
  411. self.assertEqual(list(encodings), ["shift_jis", "utf-8", "cp1252"])
  412. def test_xml_encoding(self) -> None:
  413. """A character encoding is found via the meta tag."""
  414. encodings = _get_html_media_encodings(
  415. b"""
  416. <?xml version="1.0" encoding="ascii"?>
  417. <html>
  418. </html>
  419. """,
  420. "text/html",
  421. )
  422. self.assertEqual(list(encodings), ["ascii", "utf-8", "cp1252"])
  423. def test_meta_xml_encoding(self) -> None:
  424. """Meta tags take precedence over XML encoding."""
  425. encodings = _get_html_media_encodings(
  426. b"""
  427. <?xml version="1.0" encoding="ascii"?>
  428. <html>
  429. <head><meta charset="UTF-16">
  430. </head>
  431. </html>
  432. """,
  433. "text/html",
  434. )
  435. self.assertEqual(list(encodings), ["utf-16", "ascii", "utf-8", "cp1252"])
  436. def test_content_type(self) -> None:
  437. """A character encoding is found via the Content-Type header."""
  438. # Test a few variations of the header.
  439. headers = (
  440. 'text/html; charset="ascii";',
  441. "text/html;charset=ascii;",
  442. 'text/html; charset="ascii"',
  443. "text/html; charset=ascii",
  444. 'text/html; charset="ascii;',
  445. 'text/html; charset=ascii";',
  446. )
  447. for header in headers:
  448. encodings = _get_html_media_encodings(b"", header)
  449. self.assertEqual(list(encodings), ["ascii", "utf-8", "cp1252"])
  450. def test_fallback(self) -> None:
  451. """A character encoding cannot be found in the body or header."""
  452. encodings = _get_html_media_encodings(b"", "text/html")
  453. self.assertEqual(list(encodings), ["utf-8", "cp1252"])
  454. def test_duplicates(self) -> None:
  455. """Ensure each encoding is only attempted once."""
  456. encodings = _get_html_media_encodings(
  457. b"""
  458. <?xml version="1.0" encoding="utf8"?>
  459. <html>
  460. <head><meta charset="UTF-8">
  461. </head>
  462. </html>
  463. """,
  464. 'text/html; charset="UTF_8"',
  465. )
  466. self.assertEqual(list(encodings), ["utf-8", "cp1252"])
  467. def test_unknown_invalid(self) -> None:
  468. """A character encoding should be ignored if it is unknown or invalid."""
  469. encodings = _get_html_media_encodings(
  470. b"""
  471. <html>
  472. <head><meta charset="invalid">
  473. </head>
  474. </html>
  475. """,
  476. 'text/html; charset="invalid"',
  477. )
  478. self.assertEqual(list(encodings), ["utf-8", "cp1252"])