1
0

link_details_extractor_spec.rb 7.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285
  1. # frozen_string_literal: true
  2. require 'rails_helper'
  3. RSpec.describe LinkDetailsExtractor do
  # Extractor under test. Each example group below supplies its own `html`;
  # the third constructor argument is left nil throughout this spec.
  subject do
    described_class.new(original_url, html, nil)
  end

  # URL handed to the extractor; note the tracking query string.
  let(:original_url) do
    'https://example.com/dog.html?tracking=123'
  end
  # Exercises #canonical_url against a document whose only content is a
  # <link rel="canonical"> tag pointing at `url`. The examples expect the
  # canonical href to be returned only when it is on the same host as the
  # original URL; otherwise the original URL is expected back unchanged.
  describe '#canonical_url' do
    let(:html) { "<!doctype html><link rel='canonical' href='#{url}'>" }

    context 'when canonical URL points to the same host' do
      let(:url) { 'https://example.com/dog.html' }

      # The expected value is the canonical href, not the original URL with
      # its tracking query string, so the canonical URL is used (not ignored).
      it 'uses the canonical URL' do
        expect(subject.canonical_url).to eq 'https://example.com/dog.html'
      end
    end

    context 'when canonical URL points to another host' do
      let(:url) { 'https://different.example.net/dog.html' }

      it 'ignores the canonical URLs' do
        expect(subject.canonical_url).to eq original_url
      end
    end

    # The literal strings "null" and "undefined" stand in for placeholder
    # hrefs; both are expected to fall back to the original URL.
    context 'when canonical URL is set to "null"' do
      let(:url) { 'null' }

      it 'ignores the canonical URLs' do
        expect(subject.canonical_url).to eq original_url
      end
    end

    context 'when canonical URL is set to "undefined"' do
      let(:url) { 'undefined' }

      it 'ignores the canonical URLs' do
        expect(subject.canonical_url).to eq original_url
      end
    end
  end
  # Document with only <title>, <meta name="description"> and <html lang>.
  # The description contains the entity &#39;, which the example expects to
  # come back as a plain apostrophe.
  context 'when only basic metadata is present' do
    let(:html) do
      <<~HTML
        <!doctype html>
        <html lang="en">
        <head>
        <title>Man bites dog</title>
        <meta name="description" content="A dog&#39;s tale">
        </head>
        </html>
      HTML
    end

    it 'extracts the expected values from html metadata' do
      expect(subject).to have_attributes(
        title: eq('Man bites dog'),
        description: eq("A dog's tale"),
        language: eq('en')
      )
    end
  end
  # JSON-LD (<script type="application/ld+json">) extraction. Each nested
  # context embeds the JSON produced by `ld_json` in a differently shaped
  # document; the shared examples assert the same attributes every time.
  context 'when structured data is present' do
    # schema.org NewsArticle, serialized to a JSON string so it can be
    # interpolated into the HTML fixtures below.
    let(:ld_json) do
      {
        '@context' => 'https://schema.org',
        '@type' => 'NewsArticle',
        'headline' => 'Man bites dog',
        'description' => "A dog's tale",
        'datePublished' => '2022-01-31T19:53:00+00:00',
        'author' => {
          '@type' => 'Organization',
          'name' => 'Charlie Brown',
        },
        'publisher' => {
          '@type' => 'NewsMediaOrganization',
          'name' => 'Pet News',
          'url' => 'https://example.com',
        },
        'inLanguage' => {
          'name' => 'English',
          'alternateName' => 'en',
        },
      }.to_json
    end

    shared_examples 'structured data' do
      it 'extracts the expected values from structured data' do
        expect(subject)
          .to have_attributes(
            title: eq('Man bites dog'),
            description: eq("A dog's tale"),
            published_at: eq('2022-01-31T19:53:00+00:00'),
            author_name: eq('Charlie Brown'),
            provider_name: eq('Pet News'),
            language: eq('en')
          )
      end
    end

    # The JSON is surrounded by `//<![CDATA[` ... `//]]>` comment markers.
    context 'when wrapped in CDATA tags' do
      let(:html) { <<~HTML }
        <!doctype html>
        <html>
        <head>
        <script type="application/ld+json">
        //<![CDATA[
        #{ld_json}
        //]]>
        </script>
        </head>
        </html>
      HTML

      include_examples 'structured data'
    end

    # An unparseable first script tag must not prevent the second from
    # being used.
    context 'when the first tag is invalid JSON' do
      let(:html) { <<~HTML }
        <!doctype html>
        <html>
        <body>
        <script type="application/ld+json">
        invalid LD+JSON
        </script>
        <script type="application/ld+json">
        #{ld_json}
        </script>
        </body>
        </html>
      HTML

      include_examples 'structured data'
    end

    # `null` is valid JSON but carries no data; the second tag is expected
    # to be used instead.
    context 'when the first tag is null' do
      let(:html) { <<~HTML }
        <!doctype html>
        <html>
        <body>
        <script type="application/ld+json">
        null
        </script>
        <script type="application/ld+json">
        #{ld_json}
        </script>
        </body>
        </html>
      HTML

      include_examples 'structured data'
    end

    # The first script tag holds an array of ItemList / BreadcrumbList
    # objects whose values ("Man bites cat") must not appear in the result.
    context 'with preceding block of unsupported LD+JSON' do
      let(:html) { <<~HTML }
        <!doctype html>
        <html>
        <body>
        <script type="application/ld+json">
        [
        {
        "@context": "https://schema.org",
        "@type": "ItemList",
        "url": "https://example.com/cat.html",
        "name": "Man bites cat",
        "description": "A cat's tale"
        },
        {
        "@context": "https://schema.org",
        "@type": "BreadcrumbList",
        "itemListElement":[
        {
        "@type": "ListItem",
        "position": 1,
        "item": {
        "@id": "https://www.example.com",
        "name": "Cat News"
        }
        }
        ]
        }
        ]
        </script>
        <script type="application/ld+json">
        #{ld_json}
        </script>
        </body>
        </html>
      HTML

      include_examples 'structured data'
    end

    # The NewsArticle shares a single JSON array with an ItemList entry that
    # precedes it.
    context 'with unsupported LD+JSON in the same block' do
      let(:html) { <<~HTML }
        <!doctype html>
        <html>
        <body>
        <script type="application/ld+json">
        [
        {
        "@context": "https://schema.org",
        "@type": "ItemList",
        "url": "https://example.com/cat.html",
        "name": "Man bites cat",
        "description": "A cat's tale"
        },
        #{ld_json}
        ]
        </script>
        </body>
        </html>
      HTML

      include_examples 'structured data'
    end

    context 'with author names as array' do
      # Overrides the outer `ld_json` with an author whose `name` is a list.
      let(:ld_json) do
        {
          '@context' => 'https://schema.org',
          '@type' => 'NewsArticle',
          'headline' => 'A lot of authors',
          'description' => 'But we decided to cram them into one',
          'author' => {
            '@type' => 'Person',
            'name' => ['Author 1', 'Author 2'],
          },
        }.to_json
      end

      let(:html) { <<~HTML }
        <!doctype html>
        <html>
        <body>
        <script type="application/ld+json">
        #{ld_json}
        </script>
        </body>
        </html>
      HTML

      it 'joins author names' do
        expect(subject.author_name).to eq 'Author 1, Author 2'
      end
    end
  end
  # Document whose metadata comes from <meta property="..."> tags only:
  # og:* properties plus article:published_time.
  context 'when Open Graph protocol data is present' do
    let(:html) do
      <<~HTML
        <!doctype html>
        <html>
        <head>
        <meta property="og:url" content="https://example.com/dog.html">
        <meta property="og:title" content="Man bites dog">
        <meta property="og:description" content="A dog's tale">
        <meta property="article:published_time" content="2022-01-31T19:53:00+00:00">
        <meta property="og:author" content="Charlie Brown">
        <meta property="og:locale" content="en">
        <meta property="og:image" content="https://example.com/snoopy.jpg">
        <meta property="og:image:alt" content="A good boy">
        <meta property="og:site_name" content="Pet News">
        </head>
        </html>
      HTML
    end

    # og:url is on the same host as the original URL, so it is expected to
    # be reported as the canonical URL.
    it 'extracts the expected values from open graph data' do
      expect(subject).to have_attributes(
        canonical_url: eq('https://example.com/dog.html'),
        title: eq('Man bites dog'),
        description: eq("A dog's tale"),
        published_at: eq('2022-01-31T19:53:00+00:00'),
        author_name: eq('Charlie Brown'),
        language: eq('en'),
        image: eq('https://example.com/snoopy.jpg'),
        image_alt: eq('A good boy'),
        provider_name: eq('Pet News')
      )
    end
  end
  255. end