link_details_extractor.rb 6.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286
  1. # frozen_string_literal: true
  2. class LinkDetailsExtractor
  3. include ActionView::Helpers::TagHelper
  4. include LanguagesHelper
  # Publishers sometimes wrap the JSON-LD inside their <script> tags in CDATA
  # markers that are themselves hidden in JavaScript comments. This pattern
  # matches a line made up solely of such an opening or closing marker, so
  # those lines can be stripped before the payload is parsed as JSON.
  CDATA_JUNK_PATTERN = %r{^\s*(
    (/\*\s*<!\[CDATA\[\s*\*/) # Block comment style opening
    |
    (//\s*<!\[CDATA\[) # Single-line comment style opening
    |
    (/\*\s*\]\]>\s*\*/) # Block comment style closing
    |
    (//\s*\]\]>) # Single-line comment style closing
  )\s*$}x
  # Wraps one JSON-LD document and exposes selected fields of the first
  # top-level object whose @type is listed in SUPPORTED_TYPES. The raw text is
  # parsed lazily, on the first field access.
  class StructuredData
    SUPPORTED_TYPES = %w(
      NewsArticle
      WebPage
    ).freeze

    # data - raw JSON-LD text (String)
    def initialize(data)
      @data = data
    end

    def headline
      json['headline']
    end

    def description
      json['description']
    end

    # inLanguage may be a plain value or an object; for an object the
    # alternateName is preferred over the name.
    def language
      lang = json['inLanguage']
      lang.is_a?(Hash) ? (lang['alternateName'] || lang['name']) : lang
    end

    # Returns the supported type of the selected object, or nil when no
    # object was selected. When @type is an array, the first supported entry
    # is returned so callers can keep comparing against a single String.
    def type
      supported_types_of(json).first
    end

    # Returns the URL of the first image, whether it is given as a plain
    # value or as an object with a 'url' key.
    def image
      obj = first_of_value(json['image'])
      return obj['url'] if obj.is_a?(Hash)

      obj
    end

    def date_published
      json['datePublished']
    end

    def date_modified
      json['dateModified']
    end

    def author_name
      author['name']
    end

    def author_url
      author['url']
    end

    def publisher_name
      publisher['name']
    end

    # Returns the publisher logo URL. The logo is accepted as a plain value,
    # as an object with a 'url' key, or as an array of either (first wins),
    # mirroring how #image is read.
    def publisher_logo
      logo = first_of_value(publisher['logo'])
      logo.is_a?(Hash) ? logo['url'] : logo
    end

    # True when a supported object was found in the document.
    def valid?
      json.present?
    end

    private

    def author
      hash_or_empty(first_of_value(json['author']))
    end

    def publisher
      hash_or_empty(first_of_value(json['publisher']))
    end

    # Anything that is not an object is treated as "no data". Without this,
    # a plain-string author would be indexed with String#[], which performs
    # a substring lookup ("Username"['name'] => "name"), and a plain-string
    # publisher would raise on the logo lookup.
    def hash_or_empty(value)
      value.is_a?(Hash) ? value : {}
    end

    def first_of_value(arr)
      arr.is_a?(Array) ? arr.first : arr
    end

    def root_array(root)
      root.is_a?(Array) ? root : [root]
    end

    # @type may be a single string or an array of strings.
    def supported_types_of(obj)
      Array(obj['@type']) & SUPPORTED_TYPES
    end

    # Selects the first top-level object with a supported type. Non-object
    # entries (null, numbers, strings) are skipped instead of being indexed,
    # which would raise an error the caller does not rescue.
    def json
      @json ||= root_array(Oj.load(@data)).find { |obj| obj.is_a?(Hash) && !supported_types_of(obj).empty? } || {}
    end
  end
  # Sets up an extractor for one fetched page.
  #
  # original_url - address of the page; parsed with Addressable::URI
  # html         - page markup
  # html_charset - charset hint, later handed to the encoding detector
  def initialize(original_url, html, html_charset)
    @html_charset = html_charset
    @html         = html
    @original_url = Addressable::URI.parse(original_url)
  end
  # Collects every extracted value into one Hash. Missing text values fall
  # back to an empty string and missing dimensions to 0; image_remote_url,
  # type, link_type, language and published_at are passed through as-is
  # (published_at via #presence).
  def to_preview_card_attributes
    attributes = {}

    attributes[:title]             = title || ''
    attributes[:description]       = description || ''
    attributes[:image_remote_url]  = image
    attributes[:image_description] = image_alt || ''
    attributes[:type]              = type
    attributes[:link_type]         = link_type
    attributes[:width]             = width || 0
    attributes[:height]            = height || 0
    attributes[:html]              = html || ''
    attributes[:provider_name]     = provider_name || ''
    attributes[:provider_url]      = provider_url || ''
    attributes[:author_name]       = author_name || ''
    attributes[:author_url]        = author_url || ''
    attributes[:embed_url]         = embed_url || ''
    attributes[:language]          = language
    attributes[:published_at]      = published_at.presence

    attributes
  end
  # :video when the page declares a usable player URL, :link otherwise.
  def type
    return :video if player_url.present?

    :link
  end
  # :article when the structured data is a NewsArticle or the page's og:type
  # is "article"; :unknown in every other case.
  def link_type
    article = structured_data&.type == 'NewsArticle' || opengraph_tag('og:type') == 'article'
    article ? :article : :unknown
  end
  # Builds an <iframe> tag pointing at the player URL, sized from the player
  # width/height tags. Returns nil when there is no player URL.
  def html
    src = player_url
    return unless src.present?

    content_tag(:iframe, nil, src: src, width: width, height: height, allowfullscreen: 'true', allowtransparency: 'true', scrolling: 'no', frameborder: '0')
  end
  # Raw content of the twitter:player:width meta tag, or nil when absent.
  def width
    opengraph_tag('twitter:player:width')
  end
  # Raw content of the twitter:player:height meta tag, or nil when absent.
  def height
    opengraph_tag('twitter:player:height')
  end
  # Page title with HTML entities decoded. Sources, in order of preference:
  # structured-data headline, og:title, then the first <title> element.
  def title
    raw_title = structured_data&.headline
    raw_title ||= opengraph_tag('og:title')
    raw_title ||= document.xpath('//title').first&.content

    html_entities.decode(raw_title)
  end
  # Page description with HTML entities decoded. Sources, in order of
  # preference: structured data, og:description, the "description" meta tag.
  def description
    raw_description = structured_data&.description
    raw_description ||= opengraph_tag('og:description')
    raw_description ||= meta_tag('description')

    html_entities.decode(raw_description)
  end
  # Publication date as found in the page: the structured-data value when
  # there is one, otherwise the article:published_time meta tag.
  def published_at
    from_structured_data = structured_data&.date_published
    return from_structured_data if from_structured_data

    opengraph_tag('article:published_time')
  end
  # Absolute URL taken from the og:image tag, or nil when it is missing or
  # does not pass URL validation.
  def image
    candidate = opengraph_tag('og:image')
    valid_url_or_nil(candidate)
  end
  # Raw content of the og:image:alt meta tag, or nil when absent.
  def image_alt
    opengraph_tag('og:image:alt')
  end
  # Canonical address of the page. Uses the rel="canonical" link or, failing
  # that, og:url, but only if it validates and stays on the same host as the
  # original URL; otherwise the original URL is returned as a String.
  def canonical_url
    declared = link_tag('canonical') || opengraph_tag('og:url')
    valid_url_or_nil(declared, same_origin_only: true) || @original_url.to_s
  end
  # Site name with HTML entities decoded: the structured-data publisher name
  # when available, otherwise og:site_name.
  def provider_name
    raw_name = structured_data&.publisher_name
    raw_name ||= opengraph_tag('og:site_name')

    html_entities.decode(raw_name)
  end
  # Site URL derived from the og:site tag. The tag value is first turned into
  # a URL by host_to_url and then validated; nil when either step fails.
  def provider_url
    site = opengraph_tag('og:site')
    valid_url_or_nil(host_to_url(site))
  end
  # Author name with HTML entities decoded. Sources, in order of preference:
  # structured data, og:author, og:author:username.
  def author_name
    raw_name = structured_data&.author_name
    raw_name ||= opengraph_tag('og:author')
    raw_name ||= opengraph_tag('og:author:username')

    html_entities.decode(raw_name)
  end
  # Author URL as given in the structured data; nil when the page has no
  # usable structured data. The value is returned without validation.
  def author_url
    structured_data&.author_url
  end
  # Absolute URL taken from the twitter:player:stream tag, or nil when it is
  # missing or does not pass URL validation.
  def embed_url
    stream = opengraph_tag('twitter:player:stream')
    valid_url_or_nil(stream)
  end
  # Language of the page. Sources, in order of preference: structured data,
  # og:locale, the lang attribute of <html>. The candidate is filtered through
  # valid_locale_or_nil, which is not defined in this file (presumably it
  # comes from LanguagesHelper).
  def language
    candidate = structured_data&.language
    candidate ||= opengraph_tag('og:locale')
    candidate ||= document.xpath('//html').pick('lang')

    valid_locale_or_nil(candidate)
  end
  # Absolute URL of an icon for the site, or nil when none validates.
  # Sources, in order of preference: the publisher logo from the structured
  # data, the apple-touch-icon link, the "shortcut icon" link.
  #
  # StructuredData exposes the publisher image as #publisher_logo; it has no
  # #publisher_icon method, so calling that name raised NoMethodError on
  # every page that carried usable structured data.
  def icon
    candidate = structured_data&.publisher_logo
    candidate ||= link_tag('apple-touch-icon')
    candidate ||= link_tag('shortcut icon')

    valid_url_or_nil(candidate)
  end
  164. private
  # Absolute URL taken from the twitter:player tag, or nil when it is missing
  # or does not pass URL validation.
  def player_url
    player = opengraph_tag('twitter:player')
    valid_url_or_nil(player)
  end
  # Turns a bare host into a URL by prefixing "http://". Values that already
  # start with http:// or https:// are returned unchanged; blank input gives
  # nil.
  def host_to_url(str)
    return if str.blank?
    return str if str.start_with?(%r{https?://})

    "http://#{str}"
  end
  # Resolves str against the original URL and returns the result as a String,
  # or nil when it is unusable.
  #
  # str              - candidate URL, absolute or relative
  # same_origin_only - when true, reject URLs whose host differs from the
  #                    original URL's host
  #
  # nil is returned for blank input, the literal string "null", a result
  # without a host, a scheme other than http/https, and input that
  # Addressable rejects as an invalid URI.
  def valid_url_or_nil(str, same_origin_only: false)
    return if str.blank? || str == 'null'

    url = @original_url + Addressable::URI.parse(str)

    return if url.host.blank?
    return unless %w(http https).include?(url.scheme)
    return if same_origin_only && url.host != @original_url.host

    url.to_s
  rescue Addressable::URI::InvalidURIError
    nil
  end
  # href of the first <link> element whose rel attribute equals name exactly,
  # or nil when there is none.
  def link_tag(name)
    query = %(//link[@rel="#{name}"])
    document.xpath(query).pick('href')
  end
  # content of the first <meta> element whose property or name attribute
  # equals name, or nil when there is none.
  def opengraph_tag(name)
    query = %(//meta[@property="#{name}" or @name="#{name}"])
    document.xpath(query).pick('content')
  end
  # content of the first <meta> element whose name attribute equals name, or
  # nil when there is none.
  def meta_tag(name)
    query = %(//meta[@name="#{name}"])
    document.xpath(query).pick('content')
  end
  # Returns the first usable StructuredData found in the page's
  # application/ld+json script tags, or nil when there is none.
  #
  # Some publications have more than one JSON-LD definition on the page, and
  # some of those definitions aren't valid JSON either, so the script tags
  # are tried in document order until one parses and has a supported type.
  #
  # The result is memoized with a defined? check rather than ||=, so a nil
  # result is remembered too; otherwise every accessor call on a page without
  # usable JSON-LD would scan and parse all script tags again. The lazy
  # enumerator stops at the first valid candidate instead of parsing the
  # remaining tags.
  def structured_data
    return @structured_data if defined?(@structured_data)

    @structured_data = document.xpath('//script[@type="application/ld+json"]').lazy.filter_map do |element|
      # Strip comment-wrapped CDATA marker lines before parsing
      json_ld = element.content&.gsub(CDATA_JUNK_PATTERN, '')
      next if json_ld.blank?

      candidate = StructuredData.new(html_entities.decode(json_ld))
      next unless candidate.valid?

      candidate
    rescue Oj::ParseError, EncodingError
      Rails.logger.debug { "Invalid JSON-LD in #{@original_url}" }
      next
    end.first
  end
  # Parsed page, built once with Nokogiri using the detected encoding.
  def document
    return @document if @document

    @document = Nokogiri::HTML(@html, nil, encoding)
  end
  # Encoding name guessed by the detector from the markup and the charset
  # hint. The guess is only used when its confidence is above 60; otherwise
  # nil is returned.
  def encoding
    return @encoding if @encoding

    guess = detector.detect(@html, @html_charset)
    confidence = guess&.fetch(:confidence, 0).to_i

    @encoding = confidence > 60 ? guess&.fetch(:encoding, nil) : nil
  end
  # Shared CharlockHolmes encoding detector with strip_tags switched on.
  def detector
    return @detector if @detector

    instance = CharlockHolmes::EncodingDetector.new
    instance.strip_tags = true

    @detector = instance
  end
  # Shared HTMLEntities instance used to decode entities in extracted text.
  def html_entities
    return @html_entities if @html_entities

    @html_entities = HTMLEntities.new
  end
  222. end