link_details_extractor_spec.rb 8.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302
  1. # frozen_string_literal: true
  2. require 'rails_helper'
  3. RSpec.describe LinkDetailsExtractor do
  # Extractor under test. Each example group below supplies its own `html` fixture.
  # NOTE(review): the third constructor argument is passed as nil; its meaning is not visible in this spec.
  subject do
    described_class.new(original_url, html, nil)
  end

  # URL handed to the extractor as the page's original location (includes a tracking query string).
  let(:original_url) do
    'https://example.com/dog.html?tracking=123'
  end
  # Checks which URL #canonical_url reports when the page declares a
  # <link rel="canonical"> whose href is supplied by each context via `url`.
  describe '#canonical_url' do
    let(:html) { "<!doctype html><link rel='canonical' href='#{url}'>" }

    context 'when canonical URL points to the same host' do
      let(:url) { 'https://example.com/dog.html' }

      # The expected value is the canonical href (no tracking query string),
      # not original_url, so the canonical link is honoured here.
      it 'uses the canonical URL' do
        expect(subject.canonical_url).to eq 'https://example.com/dog.html'
      end
    end

    context 'when canonical URL points to another host' do
      let(:url) { 'https://different.example.net/dog.html' }

      it 'ignores the canonical URLs' do
        expect(subject.canonical_url).to eq original_url
      end
    end

    context 'when canonical URL is set to "null"' do
      # The href is the literal string "null", not a missing attribute.
      let(:url) { 'null' }

      it 'ignores the canonical URLs' do
        expect(subject.canonical_url).to eq original_url
      end
    end
  end
  # Fixture limited to a <title>, a description <meta> tag and a lang attribute on <html>.
  context 'when only basic metadata is present' do
    let(:html) do
      <<~HTML
        <!doctype html>
        <html lang="en">
        <head>
        <title>Man bites dog</title>
        <meta name="description" content="A dog&#39;s tale">
        </head>
        </html>
      HTML
    end

    describe '#title' do
      it 'returns the title from title tag' do
        expect(subject.title).to eq('Man bites dog')
      end
    end

    describe '#description' do
      # The fixture writes the apostrophe as &#39;; the expectation uses the decoded character.
      it 'returns the description from meta tag' do
        expect(subject.description).to eq("A dog's tale")
      end
    end

    describe '#language' do
      it 'returns the language from lang attribute' do
        expect(subject.language).to eq('en')
      end
    end
  end
  # Runs one shared set of expectations against several HTML fixtures that
  # embed the same NewsArticle LD+JSON payload in different surroundings.
  context 'when structured data is present' do
    # Serialized NewsArticle payload interpolated into every fixture below.
    # All keys are strings so the hash reads consistently; the JSON output is
    # the same as with symbol keys.
    let(:ld_json) do
      {
        '@context' => 'https://schema.org',
        '@type' => 'NewsArticle',
        'headline' => 'Man bites dog',
        'description' => "A dog's tale",
        'datePublished' => '2022-01-31T19:53:00+00:00',
        'author' => {
          '@type' => 'Organization',
          'name' => 'Charlie Brown',
        },
        'publisher' => {
          '@type' => 'NewsMediaOrganization',
          'name' => 'Pet News',
          'url' => 'https://example.com',
        },
        'inLanguage' => {
          'name' => 'English',
          'alternateName' => 'en',
        },
      }.to_json
    end

    # Expectations every fixture must satisfy; the values mirror `ld_json`.
    shared_examples 'structured data' do
      describe '#title' do
        it 'returns the title from structured data' do
          expect(subject.title).to eq 'Man bites dog'
        end
      end

      describe '#description' do
        it 'returns the description from structured data' do
          expect(subject.description).to eq "A dog's tale"
        end
      end

      describe '#published_at' do
        it 'returns the publication time from structured data' do
          expect(subject.published_at).to eq '2022-01-31T19:53:00+00:00'
        end
      end

      describe '#author_name' do
        it 'returns the author name from structured data' do
          expect(subject.author_name).to eq 'Charlie Brown'
        end
      end

      describe '#provider_name' do
        it 'returns the provider name from structured data' do
          expect(subject.provider_name).to eq 'Pet News'
        end
      end

      describe '#language' do
        it 'returns the language from structured data' do
          expect(subject.language).to eq 'en'
        end
      end
    end

    # The payload sits between script-commented CDATA markers.
    context 'when it is wrapped in CDATA tags' do
      let(:html) { <<~HTML }
        <!doctype html>
        <html>
        <head>
        <script type="application/ld+json">
        //<![CDATA[
        #{ld_json}
        //]]>
        </script>
        </head>
        </html>
      HTML

      include_examples 'structured data'
    end

    # An unparseable LD+JSON script precedes the valid one.
    context 'when the first tag is invalid JSON' do
      let(:html) { <<~HTML }
        <!doctype html>
        <html>
        <body>
        <script type="application/ld+json">
        invalid LD+JSON
        </script>
        <script type="application/ld+json">
        #{ld_json}
        </script>
        </body>
        </html>
      HTML

      include_examples 'structured data'
    end

    # A separate script holding ItemList/BreadcrumbList entries comes first;
    # the expectations still match the NewsArticle values, not the cat ones.
    context 'with preceding block of unsupported LD+JSON' do
      let(:html) { <<~HTML }
        <!doctype html>
        <html>
        <body>
        <script type="application/ld+json">
        [
          {
            "@context": "https://schema.org",
            "@type": "ItemList",
            "url": "https://example.com/cat.html",
            "name": "Man bites cat",
            "description": "A cat's tale"
          },
          {
            "@context": "https://schema.org",
            "@type": "BreadcrumbList",
            "itemListElement":[
              {
                "@type": "ListItem",
                "position": 1,
                "item": {
                  "@id": "https://www.example.com",
                  "name": "Cat News"
                }
              }
            ]
          }
        ]
        </script>
        <script type="application/ld+json">
        #{ld_json}
        </script>
        </body>
        </html>
      HTML

      include_examples 'structured data'
    end

    # The ItemList entry and the NewsArticle payload share one JSON array.
    context 'with unsupported LD+JSON in the same block' do
      let(:html) { <<~HTML }
        <!doctype html>
        <html>
        <body>
        <script type="application/ld+json">
        [
          {
            "@context": "https://schema.org",
            "@type": "ItemList",
            "url": "https://example.com/cat.html",
            "name": "Man bites cat",
            "description": "A cat's tale"
          },
          #{ld_json}
        ]
        </script>
        </body>
        </html>
      HTML

      include_examples 'structured data'
    end
  end
  # Fixture whose only metadata is a set of Open Graph <meta property="..."> tags
  # (plus article:published_time); each example reads one of them back.
  context 'when Open Graph protocol data is present' do
    let(:html) { <<~HTML }
      <!doctype html>
      <html>
      <head>
      <meta property="og:url" content="https://example.com/dog.html">
      <meta property="og:title" content="Man bites dog">
      <meta property="og:description" content="A dog's tale">
      <meta property="article:published_time" content="2022-01-31T19:53:00+00:00">
      <meta property="og:author" content="Charlie Brown">
      <meta property="og:locale" content="en">
      <meta property="og:image" content="https://example.com/snoopy.jpg">
      <meta property="og:image:alt" content="A good boy">
      <meta property="og:site_name" content="Pet News">
      </head>
      </html>
    HTML

    describe '#canonical_url' do
      # Expected value equals the og:url content, which lacks original_url's query string.
      it 'returns the URL from Open Graph protocol data' do
        expect(subject.canonical_url).to eq 'https://example.com/dog.html'
      end
    end

    describe '#title' do
      it 'returns the title from Open Graph protocol data' do
        expect(subject.title).to eq 'Man bites dog'
      end
    end

    describe '#description' do
      it 'returns the description from Open Graph protocol data' do
        expect(subject.description).to eq "A dog's tale"
      end
    end

    describe '#published_at' do
      it 'returns the publication time from Open Graph protocol data' do
        expect(subject.published_at).to eq '2022-01-31T19:53:00+00:00'
      end
    end

    describe '#author_name' do
      it 'returns the author name from Open Graph protocol data' do
        expect(subject.author_name).to eq 'Charlie Brown'
      end
    end

    describe '#language' do
      it 'returns the language from Open Graph protocol data' do
        expect(subject.language).to eq 'en'
      end
    end

    describe '#image' do
      it 'returns the image from Open Graph protocol data' do
        expect(subject.image).to eq 'https://example.com/snoopy.jpg'
      end
    end

    # Group is named after the method actually called (image_alt), which is fed by the og:image:alt tag.
    describe '#image_alt' do
      it 'returns the image description from Open Graph protocol data' do
        expect(subject.image_alt).to eq 'A good boy'
      end
    end

    describe '#provider_name' do
      it 'returns the provider name from Open Graph protocol data' do
        expect(subject.provider_name).to eq 'Pet News'
      end
    end
  end
  263. end