123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287 |
- # frozen_string_literal: true
- class LinkDetailsExtractor
- include ActionView::Helpers::TagHelper
- include LanguagesHelper
- # Some publications wrap their JSON-LD data in their <script> tags
- # in commented-out CDATA blocks, they need to be removed before
- # attempting to parse JSON
- CDATA_JUNK_PATTERN = %r{^\s*(
- (/\*\s*<!\[CDATA\[\s*\*/) # Block comment style opening
- |
- (//\s*<!\[CDATA\[) # Single-line comment style opening
- |
- (/\*\s*\]\]>\s*\*/) # Block comment style closing
- |
- (//\s*\]\]>) # Single-line comment style closing
- )\s*$}x
- class StructuredData
- SUPPORTED_TYPES = %w(
- NewsArticle
- WebPage
- ).freeze
- def initialize(data)
- @data = data
- end
- def headline
- json['headline']
- end
- def description
- json['description']
- end
- def language
- lang = json['inLanguage']
- lang = lang.first if lang.is_a?(Array)
- lang.is_a?(Hash) ? (lang['alternateName'] || lang['name']) : lang
- end
- def type
- json['@type']
- end
- def image
- obj = first_of_value(json['image'])
- return obj['url'] if obj.is_a?(Hash)
- obj
- end
- def date_published
- json['datePublished']
- end
- def date_modified
- json['dateModified']
- end
- def author_name
- author['name']
- end
- def author_url
- author['url']
- end
- def publisher_name
- publisher['name']
- end
- def publisher_logo
- publisher.dig('logo', 'url')
- end
- def valid?
- json.present?
- end
- private
- def author
- first_of_value(json['author']) || {}
- end
- def publisher
- first_of_value(json['publisher']) || {}
- end
- def first_of_value(arr)
- arr.is_a?(Array) ? arr.first : arr
- end
- def root_array(root)
- root.is_a?(Array) ? root : [root]
- end
- def json
- @json ||= root_array(Oj.load(@data)).find { |obj| SUPPORTED_TYPES.include?(obj['@type']) } || {}
- end
- end
- def initialize(original_url, html, html_charset)
- @original_url = Addressable::URI.parse(original_url)
- @html = html
- @html_charset = html_charset
- end
- def to_preview_card_attributes
- {
- title: title || '',
- description: description || '',
- image_remote_url: image,
- image_description: image_alt || '',
- type: type,
- link_type: link_type,
- width: width || 0,
- height: height || 0,
- html: html || '',
- provider_name: provider_name || '',
- provider_url: provider_url || '',
- author_name: author_name || '',
- author_url: author_url || '',
- embed_url: embed_url || '',
- language: language,
- published_at: published_at.presence,
- }
- end
- def type
- player_url.present? ? :video : :link
- end
- def link_type
- if structured_data&.type == 'NewsArticle' || opengraph_tag('og:type') == 'article'
- :article
- else
- :unknown
- end
- end
- def html
- player_url.present? ? content_tag(:iframe, nil, src: player_url, width: width, height: height, allowfullscreen: 'true', allowtransparency: 'true', scrolling: 'no', frameborder: '0') : nil
- end
- def width
- opengraph_tag('twitter:player:width')
- end
- def height
- opengraph_tag('twitter:player:height')
- end
- def title
- html_entities.decode(structured_data&.headline || opengraph_tag('og:title') || document.xpath('//title').map(&:content).first)
- end
- def description
- html_entities.decode(structured_data&.description || opengraph_tag('og:description') || meta_tag('description'))
- end
- def published_at
- structured_data&.date_published || opengraph_tag('article:published_time')
- end
- def image
- valid_url_or_nil(opengraph_tag('og:image'))
- end
- def image_alt
- opengraph_tag('og:image:alt')
- end
- def canonical_url
- valid_url_or_nil(link_tag('canonical') || opengraph_tag('og:url'), same_origin_only: true) || @original_url.to_s
- end
- def provider_name
- html_entities.decode(structured_data&.publisher_name || opengraph_tag('og:site_name'))
- end
- def provider_url
- valid_url_or_nil(host_to_url(opengraph_tag('og:site')))
- end
- def author_name
- html_entities.decode(structured_data&.author_name || opengraph_tag('og:author') || opengraph_tag('og:author:username'))
- end
- def author_url
- structured_data&.author_url
- end
- def embed_url
- valid_url_or_nil(opengraph_tag('twitter:player:stream'))
- end
- def language
- valid_locale_or_nil(structured_data&.language || opengraph_tag('og:locale') || document.xpath('//html').pick('lang'))
- end
- def icon
- valid_url_or_nil(structured_data&.publisher_icon || link_tag('apple-touch-icon') || link_tag('shortcut icon'))
- end
- private
- def player_url
- valid_url_or_nil(opengraph_tag('twitter:player'))
- end
- def host_to_url(str)
- return if str.blank?
- str.start_with?(%r{https?://}) ? str : "http://#{str}"
- end
- def valid_url_or_nil(str, same_origin_only: false)
- return if str.blank? || str == 'null'
- url = @original_url + Addressable::URI.parse(str)
- return if url.host.blank? || !%w(http https).include?(url.scheme) || (same_origin_only && url.host != @original_url.host)
- url.to_s
- rescue Addressable::URI::InvalidURIError
- nil
- end
- def link_tag(name)
- document.xpath("//link[@rel=\"#{name}\"]").pick('href')
- end
- def opengraph_tag(name)
- document.xpath("//meta[@property=\"#{name}\" or @name=\"#{name}\"]").pick('content')
- end
- def meta_tag(name)
- document.xpath("//meta[@name=\"#{name}\"]").pick('content')
- end
- def structured_data
- # Some publications have more than one JSON-LD definition on the page,
- # and some of those definitions aren't valid JSON either, so we have
- # to loop through here until we find something that is the right type
- # and doesn't break
- @structured_data ||= document.xpath('//script[@type="application/ld+json"]').filter_map do |element|
- json_ld = element.content&.gsub(CDATA_JUNK_PATTERN, '')
- next if json_ld.blank?
- structured_data = StructuredData.new(html_entities.decode(json_ld))
- next unless structured_data.valid?
- structured_data
- rescue Oj::ParseError, EncodingError
- Rails.logger.debug { "Invalid JSON-LD in #{@original_url}" }
- next
- end.first
- end
- def document
- @document ||= Nokogiri::HTML(@html, nil, encoding)
- end
- def encoding
- @encoding ||= begin
- guess = detector.detect(@html, @html_charset)
- guess&.fetch(:confidence, 0).to_i > 60 ? guess&.fetch(:encoding, nil) : nil
- end
- end
- def detector
- @detector ||= CharlockHolmes::EncodingDetector.new.tap do |detector|
- detector.strip_tags = true
- end
- end
- def html_entities
- @html_entities ||= HTMLEntities.new
- end
- end
|