link_details_extractor_spec.rb 4.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161
  1. # frozen_string_literal: true
  2. require 'rails_helper'
  # Specs for LinkDetailsExtractor. Each example builds the extractor from an
  # original URL, an HTML document and an optional charset, then checks one of
  # its public readers (#canonical_url, #title, #description, #provider_name,
  # #author_name).
  RSpec.describe LinkDetailsExtractor do
    subject { described_class.new(original_url, html, html_charset) }

    # Neutral defaults; individual groups override only what they exercise.
    let(:original_url) { '' }
    let(:html) { '' }
    let(:html_charset) { nil }

    describe '#canonical_url' do
      let(:original_url) { 'https://foo.com/article?bar=baz123' }

      context 'when canonical URL points to another host' do
        let(:html) { '<!doctype html><link rel="canonical" href="https://bar.com/different-article" />' }

        # A canonical link on a different host is expected to be discarded in
        # favour of the URL that was originally requested.
        it 'ignores the canonical URLs' do
          expect(subject.canonical_url).to eq original_url
        end
      end

      context 'when canonical URL points to the same host' do
        let(:html) { '<!doctype html><link rel="canonical" href="https://foo.com/article" />' }

        # Same host as original_url: the canonical href (without the query
        # string) is the expected result, so the example is named accordingly.
        it 'uses the canonical URL' do
          expect(subject.canonical_url).to eq 'https://foo.com/article'
        end
      end

      context 'when canonical URL is set to "null"' do
        let(:html) { '<!doctype html><link rel="canonical" href="null" />' }

        # The literal string "null" as href is expected to be treated as no
        # usable canonical URL.
        it 'ignores the canonical URLs' do
          expect(subject.canonical_url).to eq original_url
        end
      end
    end

    context 'when structured data is present' do
      let(:original_url) { 'https://example.com/page.html' }

      # JSON-LD payload surrounded by `//<![CDATA[` ... `//]]>` marker lines.
      context 'when is wrapped in CDATA tags' do
        let(:html) { <<~HTML }
          <!doctype html>
          <html>
          <head>
          <script type="application/ld+json">
          //<![CDATA[
          {"@context":"http://schema.org","@type":"NewsArticle","mainEntityOfPage":"https://example.com/page.html","headline":"Foo","datePublished":"2022-01-31T19:53:00+00:00","url":"https://example.com/page.html","description":"Bar","author":{"@type":"Person","name":"Hoge"},"publisher":{"@type":"Organization","name":"Baz"}}
          //]]>
          </script>
          </head>
          </html>
        HTML

        describe '#title' do
          it 'returns the title from structured data' do
            expect(subject.title).to eq 'Foo'
          end
        end

        describe '#description' do
          it 'returns the description from structured data' do
            expect(subject.description).to eq 'Bar'
          end
        end

        describe '#provider_name' do
          it 'returns the provider name from structured data' do
            expect(subject.provider_name).to eq 'Baz'
          end
        end

        describe '#author_name' do
          it 'returns the author name from structured data' do
            expect(subject.author_name).to eq 'Hoge'
          end
        end
      end

      # The first ld+json script holds two top-level objects separated by a
      # comma, which is not a valid JSON document. The expectations below match
      # the NewsArticle in the second script tag (headline, description, author
      # and publisher names).
      context 'with the first tag is invalid JSON' do
        let(:html) { <<~HTML }
          <!doctype html>
          <html>
          <body>
          <script type="application/ld+json">
          {
          "@context":"https://schema.org",
          "@type":"ItemList",
          "url":"https://example.com/page.html",
          "name":"Foo",
          "description":"Bar"
          },
          {
          "@context": "https://schema.org",
          "@type": "BreadcrumbList",
          "itemListElement":[
          {
          "@type":"ListItem",
          "position":1,
          "item":{
          "@id":"https://www.example.com",
          "name":"Baz"
          }
          }
          ]
          }
          </script>
          <script type="application/ld+json">
          {
          "@context":"https://schema.org",
          "@type":"NewsArticle",
          "mainEntityOfPage": {
          "@type":"WebPage",
          "@id": "http://example.com/page.html"
          },
          "headline": "Foo",
          "description": "Bar",
          "datePublished": "2022-01-31T19:46:00+00:00",
          "author": {
          "@type": "Organization",
          "name": "Hoge"
          },
          "publisher": {
          "@type": "NewsMediaOrganization",
          "name":"Baz",
          "url":"https://example.com/"
          }
          }
          </script>
          </body>
          </html>
        HTML

        describe '#title' do
          it 'returns the title from structured data' do
            expect(subject.title).to eq 'Foo'
          end
        end

        describe '#description' do
          it 'returns the description from structured data' do
            expect(subject.description).to eq 'Bar'
          end
        end

        describe '#provider_name' do
          it 'returns the provider name from structured data' do
            expect(subject.provider_name).to eq 'Baz'
          end
        end

        describe '#author_name' do
          it 'returns the author name from structured data' do
            expect(subject.author_name).to eq 'Hoge'
          end
        end
      end
    end
  end