Browse Source

Add more accurate hashtag search (#11579)

* Add more accurate hashtag search

Use Elasticsearch to index hashtags with edge n-grams and score them
by their usage within the last 7 days and by the time since their last
activity. Only listable hashtags can appear in searches, and they must
also have been reviewed unless they match the query exactly.

* Fix search analyzer dropping non-ascii characters
Eugen Rochko 4 years ago
parent
commit
cc0a55cf9a

+ 37 - 0
app/chewy/tags_index.rb

@@ -0,0 +1,37 @@
+# frozen_string_literal: true
+
+class TagsIndex < Chewy::Index
+  settings index: { refresh_interval: '15m' }, analysis: {
+    analyzer: {
+      content: {
+        tokenizer: 'keyword',
+        filter: %w(lowercase asciifolding cjk_width),
+      },
+
+      edge_ngram: {
+        tokenizer: 'edge_ngram',
+        filter: %w(lowercase asciifolding cjk_width),
+      },
+    },
+
+    tokenizer: {
+      edge_ngram: {
+        type: 'edge_ngram',
+        min_gram: 2,
+        max_gram: 15,
+      },
+    },
+  }
+
+  define_type ::Tag.listable, delete_if: ->(tag) { tag.destroyed? || !tag.listable? } do
+    root date_detection: false do
+      field :name, type: 'text', analyzer: 'content' do
+        field :edge_ngram, type: 'text', analyzer: 'edge_ngram', search_analyzer: 'content'
+      end
+
+      field :reviewed, type: 'boolean', value: ->(tag) { tag.reviewed? }
+      field :usage, type: 'long', value: ->(tag) { tag.history.reduce(0) { |total, day| total + day[:accounts].to_i } }
+      field :last_status_at, type: 'date', value: ->(tag) { tag.last_status_at || tag.created_at }
+    end
+  end
+end

+ 10 - 4
app/models/tag.rb

@@ -13,6 +13,8 @@
 #  listable            :boolean
 #  reviewed_at         :datetime
 #  requested_review_at :datetime
+#  last_status_at      :datetime
+#  last_trend_at       :datetime
 #
 
 class Tag < ApplicationRecord
@@ -33,7 +35,8 @@ class Tag < ApplicationRecord
   scope :unreviewed, -> { where(reviewed_at: nil) }
   scope :pending_review, -> { unreviewed.where.not(requested_review_at: nil) }
   scope :usable, -> { where(usable: [true, nil]) }
-  scope :discoverable, -> { where(listable: [true, nil]).joins(:account_tag_stat).where(AccountTagStat.arel_table[:accounts_count].gt(0)).order(Arel.sql('account_tag_stats.accounts_count desc')) }
+  scope :listable, -> { where(listable: [true, nil]) }
+  scope :discoverable, -> { listable.joins(:account_tag_stat).where(AccountTagStat.arel_table[:accounts_count].gt(0)).order(Arel.sql('account_tag_stats.accounts_count desc')) }
   scope :most_used, ->(account) { joins(:statuses).where(statuses: { account: account }).group(:id).order(Arel.sql('count(*) desc')) }
 
   delegate :accounts_count,
@@ -44,6 +47,8 @@ class Tag < ApplicationRecord
 
   after_save :save_account_tag_stat
 
+  update_index('tags#tag', :self) if Chewy.enabled?
+
   def account_tag_stat
     super || build_account_tag_stat
   end
@@ -121,9 +126,10 @@ class Tag < ApplicationRecord
       normalized_term = normalize(term.strip).mb_chars.downcase.to_s
       pattern         = sanitize_sql_like(normalized_term) + '%'
 
-      Tag.where(arel_table[:name].lower.matches(pattern))
-         .where(arel_table[:score].gt(0).or(arel_table[:name].lower.eq(normalized_term)))
-         .order(Arel.sql('length(name) ASC, score DESC, name ASC'))
+      Tag.listable
+         .where(arel_table[:name].lower.matches(pattern))
+         .where(arel_table[:name].lower.eq(normalized_term).or(arel_table[:reviewed_at].not_eq(nil)))
+         .order(Arel.sql('length(name) ASC, name ASC'))
          .limit(limit)
          .offset(offset)
     end

+ 3 - 0
app/models/trending_tags.rb

@@ -17,6 +17,9 @@ class TrendingTags
       increment_historical_use!(tag.id, at_time)
       increment_unique_use!(tag.id, account.id, at_time)
       increment_vote!(tag, at_time)
+
+      tag.update(last_status_at: Time.now.utc) if tag.last_status_at.nil? || tag.last_status_at < 12.hours.ago
+      tag.update(last_trend_at: Time.now.utc)  if trending?(tag) && (tag.last_trend_at.nil? || tag.last_trend_at < 12.hours.ago)
     end
 
     def get(limit, filtered: true)

+ 1 - 1
app/services/account_search_service.rb

@@ -109,7 +109,7 @@ class AccountSearchService < BaseService
       field_value_factor: {
         field: 'followers_count',
         modifier: 'log2p',
-        missing: 1,
+        missing: 0,
       },
     }
   end

+ 4 - 4
app/services/search_service.rb

@@ -57,10 +57,10 @@ class SearchService < BaseService
   end
 
   def perform_hashtags_search!
-    Tag.search_for(
-      @query.gsub(/\A#/, ''),
-      @limit,
-      @offset
+    TagSearchService.new.call(
+      @query,
+      limit: @limit,
+      offset: @offset
     )
   end
 

+ 82 - 0
app/services/tag_search_service.rb

@@ -0,0 +1,82 @@
+# frozen_string_literal: true
+
+class TagSearchService < BaseService
+  def call(query, options = {})
+    @query  = query.strip.gsub(/\A#/, '')
+    @offset = options[:offset].to_i
+    @limit  = options[:limit].to_i
+
+    if Chewy.enabled?
+      from_elasticsearch
+    else
+      from_database
+    end
+  end
+
+  private
+
+  def from_elasticsearch
+    query = {
+      function_score: {
+        query: {
+          multi_match: {
+            query: @query,
+            fields: %w(name.edge_ngram name),
+            type: 'most_fields',
+            operator: 'and',
+          },
+        },
+
+        functions: [
+          {
+            field_value_factor: {
+              field: 'usage',
+              modifier: 'log2p',
+              missing: 0,
+            },
+          },
+
+          {
+            gauss: {
+              last_status_at: {
+                scale: '7d',
+                offset: '14d',
+                decay: 0.5,
+              },
+            },
+          },
+        ],
+
+        boost_mode: 'multiply',
+      },
+    }
+
+    filter = {
+      bool: {
+        should: [
+          {
+            term: {
+              reviewed: {
+                value: true,
+              },
+            },
+          },
+
+          {
+            term: {
+              name: {
+                value: @query,
+              },
+            },
+          },
+        ],
+      },
+    }
+
+    TagsIndex.query(query).filter(filter).limit(@limit).offset(@offset).objects.compact
+  end
+
+  def from_database
+    Tag.search_for(@query, @limit, @offset)
+  end
+end

+ 1 - 1
config/locales/simple_form.en.yml

@@ -142,7 +142,7 @@ en:
         report: Send e-mail when a new report is submitted
         trending_tag: Send e-mail when an unreviewed hashtag is trending
       tag:
-        listable: Allow this hashtag to appear on the profile directory
+        listable: Allow this hashtag to appear in searches and on the profile directory
         trendable: Allow this hashtag to appear under trends
         usable: Allow toots to use this hashtag
     'no': 'No'

+ 6 - 0
db/migrate/20190815225426_add_last_status_at_to_tags.rb

@@ -0,0 +1,6 @@
+class AddLastStatusAtToTags < ActiveRecord::Migration[5.2]
+  def change
+    add_column :tags, :last_status_at, :datetime
+    add_column :tags, :last_trend_at, :datetime
+  end
+end

+ 3 - 1
db/schema.rb

@@ -10,7 +10,7 @@
 #
 # It's strongly recommended that you check this file into your version control system.
 
-ActiveRecord::Schema.define(version: 2019_08_07_135426) do
+ActiveRecord::Schema.define(version: 2019_08_15_225426) do
 
   # These are extensions that must be enabled in order to support this database
   enable_extension "plpgsql"
@@ -667,6 +667,8 @@ ActiveRecord::Schema.define(version: 2019_08_07_135426) do
     t.boolean "listable"
     t.datetime "reviewed_at"
     t.datetime "requested_review_at"
+    t.datetime "last_status_at"
+    t.datetime "last_trend_at"
     t.index "lower((name)::text)", name: "index_tags_on_name_lower", unique: true
   end
 

+ 2 - 2
spec/models/tag_spec.rb

@@ -136,8 +136,8 @@ RSpec.describe Tag, type: :model do
     end
 
     it 'finds the exact matching tag as the first item' do
-      similar_tag = Fabricate(:tag, name: "matchlater", score: 1)
-      tag = Fabricate(:tag, name: "match", score: 1)
+      similar_tag = Fabricate(:tag, name: "matchlater", reviewed_at: Time.now.utc)
+      tag = Fabricate(:tag, name: "match", reviewed_at: Time.now.utc)
 
       results = Tag.search_for("match")