# frozen_string_literal: true

  # Detects accounts that keep sending near-identical statuses.
  #
  # A digest of each checked status' normalized text is compared against a
  # short trail of the account's earlier digests, kept in a Redis sorted set.
  # When enough earlier digests match, the account is silenced and a report is
  # filed (see #flag!); otherwise the new digest is appended to the trail
  # (see #remember!). SpamCheck.perform ties those steps together.
  class SpamCheck
    include Redisable
    include ActionView::Helpers::TextHelper

    # Threshold over which two Nilsimsa values are considered
    # to refer to the same text
    NILSIMSA_COMPARE_THRESHOLD = 95

    # Nilsimsa doesn't work well on small inputs, so below
    # this size, we check only for exact matches with MD5
    NILSIMSA_MIN_SIZE = 10

    # How long to keep the trail of digests between updates,
    # there is no reason to store it forever
    EXPIRE_SET_AFTER = 1.week.seconds

    # How many digests to keep in an account's trail. If it's
    # too small, spam could rotate around different message templates
    MAX_TRAIL_SIZE = 10

    # How many detected duplicates to allow through before
    # considering the message as spam
    THRESHOLD = 5

    # @param status the status to check; its account is the one whose digest
    #   trail is read and updated
    def initialize(status)
      @account = status.account
      @status = status
    end

    # @return [Boolean] true when the status should not be checked at all:
    #   the feature is disabled, the account is already silenced or trusted,
    #   none of the mentions are unsolicited, or the status answers a thread
    #   that mentioned the author
    def skip?
      disabled? || already_flagged? || trusted? || no_unsolicited_mentions? || solicited_reply?
    end

    # @return [Boolean] true when at least THRESHOLD entries of the account's
    #   trail match this status' digest; always false when there is no text
    #   left to hash
    def spam?
      return false if insufficient_data?

      digests_over_threshold?(digest_algorithm) { |_, other_digest| similar_digest?(other_digest) }
    end

    # Silences the account, then files a report about it.
    def flag!
      auto_silence_account!
      auto_report_status!
    end

    # Appends this status' digest to the account's trail, trims the trail to
    # the MAX_TRAIL_SIZE highest-scored entries and refreshes its expiry.
    def remember!
      # The scores in sorted sets don't actually have enough bits to hold an exact
      # value of our snowflake IDs, so we use it only for its ordering property. To
      # get the correct status ID back, we have to save it in the string value
      redis.zadd(redis_key, @status.id, digest_with_algorithm)
      redis.zremrangebyrank(redis_key, 0, -(MAX_TRAIL_SIZE + 1))
      redis.expire(redis_key, EXPIRE_SET_AFTER)
    end

    # Deletes the account's whole digest trail.
    def reset!
      redis.del(redis_key)
    end

    # The text that gets hashed: the status text without mentions (and without
    # tags for non-local statuses), prefixed with the spoiler text,
    # NFKC-normalized, downcased and with whitespace collapsed.
    #
    # @return [String] memoized after the first successful computation
    def hashable_text
      return @hashable_text if defined?(@hashable_text)

      # Build the value in a local and assign the instance variable only once,
      # at the end: if any step raises, nothing half-processed gets memoized.
      text = remove_mentions(@status.text)
      text = strip_tags(text) unless @status.local?
      text = normalize_unicode(@status.spoiler_text + ' ' + text)

      @hashable_text = remove_whitespace(text)
    end

    # @return [Boolean] true when the hashable text is blank
    def insufficient_data?
      hashable_text.blank?
    end

    # @return [String] memoized hex digest of the hashable text: Nilsimsa for
    #   texts longer than NILSIMSA_MIN_SIZE, MD5 otherwise
    def digest
      @digest ||= begin
        if nilsimsa?
          Nilsimsa.new(hashable_text).hexdigest
        else
          Digest::MD5.hexdigest(hashable_text)
        end
      end
    end

    # @return [String] the trail record for this status, formatted as
    #   "<algorithm>:<digest>:<status id>"
    def digest_with_algorithm
      [digest_algorithm, digest, @status.id].join(':')
    end

    class << self
      # Runs the whole check for a status: returns early when the check is
      # skipped, flags the account when the status is spam, and otherwise
      # records the status' digest in the trail.
      def perform(status)
        spam_check = new(status)

        return if spam_check.skip?

        if spam_check.spam?
          spam_check.flag!
        else
          spam_check.remember!
        end
      end
    end

    private

    def disabled?
      !Setting.spam_check_enabled
    end

    # Strips mentions from the text. Local statuses are handled with
    # Account::MENTION_RE; for other statuses the text is parsed as an HTML
    # fragment and every <a> whose href equals the URL of a mentioned account
    # is removed.
    def remove_mentions(text)
      return text.gsub(Account::MENTION_RE, '') if @status.local?

      fragment = Nokogiri::HTML.fragment(text)
      mention_urls = @status.mentions.map { |mention| ActivityPub::TagManager.instance.url_for(mention.account) }

      fragment.traverse do |element|
        element.unlink if element.name == 'a' && mention_urls.include?(element['href'])
      end

      fragment.to_s
    end

    def normalize_unicode(text)
      text.unicode_normalize(:nfkc).downcase
    end

    # Collapses every run of whitespace into one space and trims both ends.
    def remove_whitespace(text)
      text.gsub(/\s+/, ' ').strip
    end

    def auto_silence_account!
      @account.silence!
    end

    # Files a report from the representative account against the author.
    def auto_report_status!
      # Statuses are attached only when the triggering status is distributable:
      # the public/unlisted ones among the matching trail entries plus the
      # triggering status itself. Otherwise status_ids is deliberately left nil
      # and handed to the report service as such.
      status_ids = if @status.distributable?
                     Status.where(visibility: %i(public unlisted)).where(id: matching_status_ids).pluck(:id) + [@status.id]
                   end

      ReportService.new.call(Account.representative, @account, status_ids: status_ids, comment: I18n.t('spam_check.spam_detected_and_silenced'))
    end

    def already_flagged?
      @account.silenced?
    end

    def trusted?
      @account.trust_level > Account::TRUST_LEVELS[:untrusted]
    end

    # True when every mention is silent, or links two non-local accounts, or
    # targets an account that follows the author (also true without mentions).
    def no_unsolicited_mentions?
      @status.mentions.all? { |mention| mention.silent? || (!@account.local? && !mention.account.local?) || mention.account.following?(@account) }
    end

    # True when the status replies to a thread that itself mentioned the author.
    def solicited_reply?
      !@status.thread.nil? && @status.thread.mentions.where(account: @account).exists?
    end

    # Compares two hex-encoded Nilsimsa digests over their first 32 bytes.
    # NOTE(review): Nilsimsa::POPC is presumably a per-byte popcount lookup
    # table, which would make `bits` the number of differing bits; confirm
    # against the Nilsimsa library.
    #
    # @param first [String] hex digest
    # @param second [String] hex digest
    # @return [Integer] 128 minus the accumulated POPC values
    def nilsimsa_compare_value(first, second)
      first = [first].pack('H*')
      second = [second].pack('H*')

      bits = (0..31).sum do |i|
        Nilsimsa::POPC[255 & (first[i].ord ^ second[i].ord)].ord
      end

      128 - bits # -128 <= Nilsimsa Compare Value <= 128
    end

    def nilsimsa?
      hashable_text.size > NILSIMSA_MIN_SIZE
    end

    # Name of the algorithm used for this status, as stored in trail records.
    def digest_algorithm
      nilsimsa? ? 'nilsimsa' : 'md5'
    end

    # Whether a stored digest counts as the same text as this status' digest:
    # a Nilsimsa compare value of at least NILSIMSA_COMPARE_THRESHOLD, or
    # exact equality for MD5.
    def similar_digest?(other_digest)
      if nilsimsa?
        nilsimsa_compare_value(digest, other_digest) >= NILSIMSA_COMPARE_THRESHOLD
      else
        other_digest == digest
      end
    end

    # All records currently in the account's trail, in sorted-set order.
    def other_digests
      redis.zrange(redis_key, 0, -1)
    end

    # Counts trail records of the given algorithm for which the block returns
    # a truthy value. The block receives (algorithm, digest, status_id) as
    # parsed from the record.
    #
    # @return [Boolean] true when that count reaches THRESHOLD
    def digests_over_threshold?(filter_algorithm)
      matches = other_digests.count do |record|
        algorithm, other_digest, status_id = record.split(':')
        algorithm == filter_algorithm && yield(algorithm, other_digest, status_id)
      end

      matches >= THRESHOLD
    end

    # Status IDs (as stored in the trail, i.e. strings) of the records whose
    # digest matches this status' digest under the current algorithm.
    def matching_status_ids
      matching = other_digests.map { |record| record.split(':') }.select do |algorithm, other_digest, _|
        algorithm == digest_algorithm && similar_digest?(other_digest)
      end

      matching.map { |_, _, status_id| status_id }.compact
    end

    def redis_key
      @redis_key ||= "spam_check:#{@account.id}"
    end
  end