link_details_extractor_spec.rb 4.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159
  1. require 'rails_helper'
  # Specs for LinkDetailsExtractor.
  #
  # Each example builds the extractor from three inputs -- the URL the page was
  # fetched from, the raw HTML and an optional charset -- and then checks one of
  # its readers (#canonical_url, #title, #description, #provider_name,
  # #author_name).
  RSpec.describe LinkDetailsExtractor do
    subject { described_class.new(original_url, html, html_charset) }

    # Neutral defaults; each group below overrides only the inputs it cares about.
    let(:original_url) { '' }
    let(:html) { '' }
    let(:html_charset) { nil }

    describe '#canonical_url' do
      let(:original_url) { 'https://foo.com/article?bar=baz123' }

      # A canonical link on a different host must not replace the fetched URL.
      context 'when canonical URL points to another host' do
        let(:html) { '<!doctype html><link rel="canonical" href="https://bar.com/different-article" />' }

        it 'ignores the canonical URLs' do
          expect(subject.canonical_url).to eq original_url
        end
      end

      # A canonical link on the same host is accepted as-is.
      context 'when canonical URL points to the same host' do
        let(:html) { '<!doctype html><link rel="canonical" href="https://foo.com/article" />' }

        it 'uses the canonical URL' do
          expect(subject.canonical_url).to eq 'https://foo.com/article'
        end
      end

      # The href here is the literal string "null", not a missing attribute.
      context 'when canonical URL is set to "null"' do
        let(:html) { '<!doctype html><link rel="canonical" href="null" />' }

        it 'ignores the canonical URLs' do
          expect(subject.canonical_url).to eq original_url
        end
      end
    end

    context 'when structured data is present' do
      let(:original_url) { 'https://example.com/page.html' }

      # Both fixtures below describe the same article (headline "Foo",
      # description "Bar", publisher "Baz", author "Hoge"), so they share one
      # set of expectations. `include_examples` defines these groups directly
      # inside the including context, keeping the example descriptions stable.
      shared_examples 'structured data readers' do
        describe '#title' do
          it 'returns the title from structured data' do
            expect(subject.title).to eq 'Foo'
          end
        end

        describe '#description' do
          it 'returns the description from structured data' do
            expect(subject.description).to eq 'Bar'
          end
        end

        describe '#provider_name' do
          it 'returns the provider name from structured data' do
            expect(subject.provider_name).to eq 'Baz'
          end
        end

        describe '#author_name' do
          it 'returns the author name from structured data' do
            expect(subject.author_name).to eq 'Hoge'
          end
        end
      end

      # The JSON-LD payload is surrounded by `//<![CDATA[` ... `//]]>` comment
      # lines, which are not themselves JSON.
      context 'and is wrapped in CDATA tags' do
        let(:html) { <<~HTML }
          <!doctype html>
          <html>
          <head>
          <script type="application/ld+json">
          //<![CDATA[
          {"@context":"http://schema.org","@type":"NewsArticle","mainEntityOfPage":"https://example.com/page.html","headline":"Foo","datePublished":"2022-01-31T19:53:00+00:00","url":"https://example.com/page.html","description":"Bar","author":{"@type":"Person","name":"Hoge"},"publisher":{"@type":"Organization","name":"Baz"}}
          //]]>
          </script>
          </head>
          </html>
        HTML

        include_examples 'structured data readers'
      end

      # The first <script> holds two top-level objects separated by a comma,
      # which is not a valid JSON document. The expected values are the ones
      # carried by the second, well-formed NewsArticle tag.
      context 'but the first tag is invalid JSON' do
        let(:html) { <<~HTML }
          <!doctype html>
          <html>
          <body>
          <script type="application/ld+json">
          {
          "@context":"https://schema.org",
          "@type":"ItemList",
          "url":"https://example.com/page.html",
          "name":"Foo",
          "description":"Bar"
          },
          {
          "@context": "https://schema.org",
          "@type": "BreadcrumbList",
          "itemListElement":[
          {
          "@type":"ListItem",
          "position":1,
          "item":{
          "@id":"https://www.example.com",
          "name":"Baz"
          }
          }
          ]
          }
          </script>
          <script type="application/ld+json">
          {
          "@context":"https://schema.org",
          "@type":"NewsArticle",
          "mainEntityOfPage": {
          "@type":"WebPage",
          "@id": "http://example.com/page.html"
          },
          "headline": "Foo",
          "description": "Bar",
          "datePublished": "2022-01-31T19:46:00+00:00",
          "author": {
          "@type": "Organization",
          "name": "Hoge"
          },
          "publisher": {
          "@type": "NewsMediaOrganization",
          "name":"Baz",
          "url":"https://example.com/"
          }
          }
          </script>
          </body>
          </html>
        HTML

        include_examples 'structured data readers'
      end
    end
  end