Browse Source

Unescape HTML entities in oEmbed titles. (#14781)

It doesn't seem valid that HTML entities should appear in
the title field of oEmbed responses, but a popular WordPress
plug-in seems to do it.

There should not be harm in unescaping these.
Jeyachandran Rathnam 1 year ago
parent
commit
babeeb4e7a
3 changed files with 20 additions and 6 deletions
  1. 1 0
      changelog.d/14781.misc
  2. 9 6
      synapse/rest/media/v1/oembed.py
  3. 10 0
      tests/rest/media/v1/test_oembed.py

+ 1 - 0
changelog.d/14781.misc

@@ -0,0 +1 @@
+Unescape HTML entities in URL preview titles making use of oEmbed responses.

+ 9 - 6
synapse/rest/media/v1/oembed.py

@@ -11,6 +11,7 @@
 #  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 #  See the License for the specific language governing permissions and
 #  limitations under the License.
+import html
 import logging
 import urllib.parse
 from typing import TYPE_CHECKING, List, Optional
@@ -161,7 +162,9 @@ class OEmbedProvider:
 
         title = oembed.get("title")
         if title and isinstance(title, str):
-            open_graph_response["og:title"] = title
+            # A common WordPress plug-in seems to incorrectly escape entities
+            # in the oEmbed response.
+            open_graph_response["og:title"] = html.unescape(title)
 
         author_name = oembed.get("author_name")
         if not isinstance(author_name, str):
@@ -180,9 +183,9 @@ class OEmbedProvider:
         # Process each type separately.
         oembed_type = oembed.get("type")
         if oembed_type == "rich":
-            html = oembed.get("html")
-            if isinstance(html, str):
-                calc_description_and_urls(open_graph_response, html)
+            html_str = oembed.get("html")
+            if isinstance(html_str, str):
+                calc_description_and_urls(open_graph_response, html_str)
 
         elif oembed_type == "photo":
             # If this is a photo, use the full image, not the thumbnail.
@@ -192,8 +195,8 @@ class OEmbedProvider:
 
         elif oembed_type == "video":
             open_graph_response["og:type"] = "video.other"
-            html = oembed.get("html")
-            if html and isinstance(html, str):
+            html_str = oembed.get("html")
+            if html_str and isinstance(html_str, str):
                 calc_description_and_urls(open_graph_response, oembed["html"])
             for size in ("width", "height"):
                 val = oembed.get(size)

+ 10 - 0
tests/rest/media/v1/test_oembed.py

@@ -150,3 +150,13 @@ class OEmbedTests(HomeserverTestCase):
         result = self.parse_response({"type": "link"})
         self.assertIn("og:type", result.open_graph_result)
         self.assertEqual(result.open_graph_result["og:type"], "website")
+
+    def test_title_html_entities(self) -> None:
+        """Test HTML entities in title"""
+        result = self.parse_response(
+            {"title": "Why JSON isn’t a Good Configuration Language"}
+        )
+        self.assertEqual(
+            result.open_graph_result["og:title"],
+            "Why JSON isn’t a Good Configuration Language",
+        )