1
0

link_details_extractor.rb 6.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277
# frozen_string_literal: true
  2. class LinkDetailsExtractor
  3. include ActionView::Helpers::TagHelper
  4. include LanguagesHelper
  5. # Some publications wrap their JSON-LD data in their <script> tags
  6. # in commented-out CDATA blocks, they need to be removed before
  7. # attempting to parse JSON
  8. CDATA_JUNK_PATTERN = %r{^[\s]*(
  9. (/\*[\s]*<!\[CDATA\[[\s]*\*/) # Block comment style opening
  10. |
  11. (//[\s]*<!\[CDATA\[) # Single-line comment style opening
  12. |
  13. (/\*[\s]*\]\]>[\s]*\*/) # Block comment style closing
  14. |
  15. (//[\s]*\]\]>) # Single-line comment style closing
  16. )[\s]*$}x
  17. class StructuredData
  18. SUPPORTED_TYPES = %w(
  19. NewsArticle
  20. WebPage
  21. ).freeze
  22. def initialize(data)
  23. @data = data
  24. end
  25. def headline
  26. json['headline']
  27. end
  28. def description
  29. json['description']
  30. end
  31. def language
  32. json['inLanguage']
  33. end
  34. def type
  35. json['@type']
  36. end
  37. def image
  38. obj = first_of_value(json['image'])
  39. return obj['url'] if obj.is_a?(Hash)
  40. obj
  41. end
  42. def date_published
  43. json['datePublished']
  44. end
  45. def date_modified
  46. json['dateModified']
  47. end
  48. def author_name
  49. author['name']
  50. end
  51. def author_url
  52. author['url']
  53. end
  54. def publisher_name
  55. publisher['name']
  56. end
  57. def publisher_logo
  58. publisher.dig('logo', 'url')
  59. end
  60. def valid?
  61. json.present?
  62. end
  63. private
  64. def author
  65. first_of_value(json['author']) || {}
  66. end
  67. def publisher
  68. first_of_value(json['publisher']) || {}
  69. end
  70. def first_of_value(arr)
  71. arr.is_a?(Array) ? arr.first : arr
  72. end
  73. def root_array(root)
  74. root.is_a?(Array) ? root : [root]
  75. end
  76. def json
  77. @json ||= root_array(Oj.load(@data)).find { |obj| SUPPORTED_TYPES.include?(obj['@type']) } || {}
  78. end
  79. end
  80. def initialize(original_url, html, html_charset)
  81. @original_url = Addressable::URI.parse(original_url)
  82. @html = html
  83. @html_charset = html_charset
  84. end
  85. def to_preview_card_attributes
  86. {
  87. title: title || '',
  88. description: description || '',
  89. image_remote_url: image,
  90. type: type,
  91. link_type: link_type,
  92. width: width || 0,
  93. height: height || 0,
  94. html: html || '',
  95. provider_name: provider_name || '',
  96. provider_url: provider_url || '',
  97. author_name: author_name || '',
  98. author_url: author_url || '',
  99. embed_url: embed_url || '',
  100. language: language,
  101. }
  102. end
  103. def type
  104. player_url.present? ? :video : :link
  105. end
  106. def link_type
  107. if structured_data&.type == 'NewsArticle' || opengraph_tag('og:type') == 'article'
  108. :article
  109. else
  110. :unknown
  111. end
  112. end
  113. def html
  114. player_url.present? ? content_tag(:iframe, nil, src: player_url, width: width, height: height, allowfullscreen: 'true', allowtransparency: 'true', scrolling: 'no', frameborder: '0') : nil
  115. end
  116. def width
  117. opengraph_tag('twitter:player:width')
  118. end
  119. def height
  120. opengraph_tag('twitter:player:height')
  121. end
  122. def title
  123. html_entities.decode(structured_data&.headline || opengraph_tag('og:title') || document.xpath('//title').map(&:content).first)
  124. end
  125. def description
  126. html_entities.decode(structured_data&.description || opengraph_tag('og:description') || meta_tag('description'))
  127. end
  128. def image
  129. valid_url_or_nil(opengraph_tag('og:image'))
  130. end
  131. def canonical_url
  132. valid_url_or_nil(link_tag('canonical') || opengraph_tag('og:url'), same_origin_only: true) || @original_url.to_s
  133. end
  134. def provider_name
  135. html_entities.decode(structured_data&.publisher_name || opengraph_tag('og:site_name'))
  136. end
  137. def provider_url
  138. valid_url_or_nil(host_to_url(opengraph_tag('og:site')))
  139. end
  140. def author_name
  141. html_entities.decode(structured_data&.author_name || opengraph_tag('og:author') || opengraph_tag('og:author:username'))
  142. end
  143. def author_url
  144. structured_data&.author_url
  145. end
  146. def embed_url
  147. valid_url_or_nil(opengraph_tag('twitter:player:stream'))
  148. end
  149. def language
  150. valid_locale_or_nil(structured_data&.language || opengraph_tag('og:locale') || document.xpath('//html').map { |element| element['lang'] }.first)
  151. end
  152. def icon
  153. valid_url_or_nil(structured_data&.publisher_icon || link_tag('apple-touch-icon') || link_tag('shortcut icon'))
  154. end
  155. private
  156. def player_url
  157. valid_url_or_nil(opengraph_tag('twitter:player'))
  158. end
  159. def host_to_url(str)
  160. return if str.blank?
  161. str.start_with?(/https?:\/\//) ? str : "http://#{str}"
  162. end
  163. def valid_url_or_nil(str, same_origin_only: false)
  164. return if str.blank? || str == 'null'
  165. url = @original_url + Addressable::URI.parse(str)
  166. return if url.host.blank? || !%w(http https).include?(url.scheme) || (same_origin_only && url.host != @original_url.host)
  167. url.to_s
  168. rescue Addressable::URI::InvalidURIError
  169. nil
  170. end
  171. def link_tag(name)
  172. document.xpath("//link[@rel=\"#{name}\"]").map { |link| link['href'] }.first
  173. end
  174. def opengraph_tag(name)
  175. document.xpath("//meta[@property=\"#{name}\" or @name=\"#{name}\"]").map { |meta| meta['content'] }.first
  176. end
  177. def meta_tag(name)
  178. document.xpath("//meta[@name=\"#{name}\"]").map { |meta| meta['content'] }.first
  179. end
  180. def structured_data
  181. @structured_data ||= begin
  182. # Some publications have more than one JSON-LD definition on the page,
  183. # and some of those definitions aren't valid JSON either, so we have
  184. # to loop through here until we find something that is the right type
  185. # and doesn't break
  186. document.xpath('//script[@type="application/ld+json"]').filter_map do |element|
  187. json_ld = element.content&.gsub(CDATA_JUNK_PATTERN, '')
  188. next if json_ld.blank?
  189. structured_data = StructuredData.new(html_entities.decode(json_ld))
  190. next unless structured_data.valid?
  191. structured_data
  192. rescue Oj::ParseError, EncodingError
  193. Rails.logger.debug { "Invalid JSON-LD in #{@original_url}" }
  194. next
  195. end.first
  196. end
  197. end
  198. def document
  199. @document ||= Nokogiri::HTML(@html, nil, encoding)
  200. end
  201. def encoding
  202. @encoding ||= begin
  203. guess = detector.detect(@html, @html_charset)
  204. guess&.fetch(:confidence, 0).to_i > 60 ? guess&.fetch(:encoding, nil) : nil
  205. end
  206. end
  207. def detector
  208. @detector ||= CharlockHolmes::EncodingDetector.new.tap do |detector|
  209. detector.strip_tags = true
  210. end
  211. end
  212. def html_entities
  213. @html_entities ||= HTMLEntities.new
  214. end
  215. end