Instead of detecting spam on first duplicate message, add a threshold of 5 such messages to reduce false positivesmaster^2
@@ -408,15 +408,7 @@ class ActivityPub::Activity::Create < ActivityPub::Activity | |||
end | |||
def check_for_spam | |||
spam_check = SpamCheck.new(@status) | |||
return if spam_check.skip? | |||
if spam_check.spam? | |||
spam_check.flag! | |||
else | |||
spam_check.remember! | |||
end | |||
SpamCheck.perform(@status) | |||
end | |||
def forward_for_reply | |||
@@ -4,9 +4,25 @@ class SpamCheck | |||
include Redisable | |||
include ActionView::Helpers::TextHelper | |||
# Threshold over which two Nilsimsa values are considered | |||
# to refer to the same text | |||
NILSIMSA_COMPARE_THRESHOLD = 95 | |||
NILSIMSA_MIN_SIZE = 10 | |||
EXPIRE_SET_AFTER = 1.week.seconds | |||
# Nilsimsa doesn't work well on small inputs, so below | |||
# this size, we check only for exact matches with MD5 | |||
NILSIMSA_MIN_SIZE = 10 | |||
# How long to keep the trail of digests between updates, | |||
# there is no reason to store it forever | |||
EXPIRE_SET_AFTER = 1.week.seconds | |||
# How many digests to keep in an account's trail. If it's | |||
# too small, spam could rotate around different message templates | |||
MAX_TRAIL_SIZE = 10 | |||
# How many detected duplicates to allow through before | |||
# considering the message as spam | |||
THRESHOLD = 5 | |||
def initialize(status) | |||
@account = status.account | |||
@@ -21,9 +37,9 @@ class SpamCheck | |||
if insufficient_data? | |||
false | |||
elsif nilsimsa? | |||
any_other_digest?('nilsimsa') { |_, other_digest| nilsimsa_compare_value(digest, other_digest) >= NILSIMSA_COMPARE_THRESHOLD } | |||
digests_over_threshold?('nilsimsa') { |_, other_digest| nilsimsa_compare_value(digest, other_digest) >= NILSIMSA_COMPARE_THRESHOLD } | |||
else | |||
any_other_digest?('md5') { |_, other_digest| other_digest == digest } | |||
digests_over_threshold?('md5') { |_, other_digest| other_digest == digest } | |||
end | |||
end | |||
@@ -38,7 +54,7 @@ class SpamCheck | |||
# get the correct status ID back, we have to save it in the string value | |||
redis.zadd(redis_key, @status.id, digest_with_algorithm) | |||
redis.zremrangebyrank(redis_key, '0', '-10') | |||
redis.zremrangebyrank(redis_key, 0, -(MAX_TRAIL_SIZE + 1)) | |||
redis.expire(redis_key, EXPIRE_SET_AFTER) | |||
end | |||
@@ -78,6 +94,20 @@ class SpamCheck | |||
end | |||
end | |||
class << self | |||
def perform(status) | |||
spam_check = new(status) | |||
return if spam_check.skip? | |||
if spam_check.spam? | |||
spam_check.flag! | |||
else | |||
spam_check.remember! | |||
end | |||
end | |||
end | |||
private | |||
def disabled? | |||
@@ -149,14 +179,14 @@ class SpamCheck | |||
redis.zrange(redis_key, 0, -1) | |||
end | |||
def any_other_digest?(filter_algorithm) | |||
other_digests.any? do |record| | |||
def digests_over_threshold?(filter_algorithm) | |||
other_digests.select do |record| | |||
algorithm, other_digest, status_id = record.split(':') | |||
next unless algorithm == filter_algorithm | |||
yield algorithm, other_digest, status_id | |||
end | |||
end.size >= THRESHOLD | |||
end | |||
def matching_status_ids | |||
@@ -33,6 +33,7 @@ class ProcessMentionsService < BaseService | |||
end | |||
status.save! | |||
check_for_spam(status) | |||
mentions.each { |mention| create_notification(mention) } | |||
end | |||
@@ -61,4 +62,8 @@ class ProcessMentionsService < BaseService | |||
def resolve_account_service | |||
ResolveAccountService.new | |||
end | |||
def check_for_spam(status) | |||
SpamCheck.perform(status) | |||
end | |||
end |
@@ -86,23 +86,33 @@ RSpec.describe SpamCheck do | |||
end | |||
it 'returns true for duplicate statuses to the same recipient' do | |||
status1 = status_with_html('@alice Hello') | |||
described_class.new(status1).remember! | |||
described_class::THRESHOLD.times do | |||
status1 = status_with_html('@alice Hello') | |||
described_class.new(status1).remember! | |||
end | |||
status2 = status_with_html('@alice Hello') | |||
expect(described_class.new(status2).spam?).to be true | |||
end | |||
it 'returns true for duplicate statuses to different recipients' do | |||
status1 = status_with_html('@alice Hello') | |||
described_class.new(status1).remember! | |||
described_class::THRESHOLD.times do | |||
status1 = status_with_html('@alice Hello') | |||
described_class.new(status1).remember! | |||
end | |||
status2 = status_with_html('@bob Hello') | |||
expect(described_class.new(status2).spam?).to be true | |||
end | |||
it 'returns true for nearly identical statuses with random numbers' do | |||
source_text = 'Sodium, atomic number 11, was first isolated by Humphry Davy in 1807. A chemical component of salt, he named it Na in honor of the saltiest region on earth, North America.' | |||
status1 = status_with_html('@alice ' + source_text + ' 1234') | |||
described_class.new(status1).remember! | |||
described_class::THRESHOLD.times do | |||
status1 = status_with_html('@alice ' + source_text + ' 1234') | |||
described_class.new(status1).remember! | |||
end | |||
status2 = status_with_html('@bob ' + source_text + ' 9568') | |||
expect(described_class.new(status2).spam?).to be true | |||
end | |||
@@ -140,9 +150,9 @@ RSpec.describe SpamCheck do | |||
let(:redis_key) { spam_check.send(:redis_key) } | |||
it 'remembers' do | |||
expect do | |||
spam_check.remember! | |||
end.to change { Redis.current.exists(redis_key) }.from(false).to(true) | |||
expect(Redis.current.exists(redis_key)).to be true | |||
spam_check.remember! | |||
expect(Redis.current.exists(redis_key)).to be true | |||
end | |||
end | |||
@@ -156,9 +166,9 @@ RSpec.describe SpamCheck do | |||
end | |||
it 'resets' do | |||
expect do | |||
spam_check.reset! | |||
end.to change { Redis.current.exists(redis_key) }.from(true).to(false) | |||
expect(Redis.current.exists(redis_key)).to be true | |||
spam_check.reset! | |||
expect(Redis.current.exists(redis_key)).to be false | |||
end | |||
end | |||