Instead of detecting spam on first duplicate message, add a threshold of 5 such messages to reduce false positivesmaster^2
@@ -408,15 +408,7 @@ class ActivityPub::Activity::Create < ActivityPub::Activity | |||||
end | end | ||||
def check_for_spam | def check_for_spam | ||||
spam_check = SpamCheck.new(@status) | |||||
return if spam_check.skip? | |||||
if spam_check.spam? | |||||
spam_check.flag! | |||||
else | |||||
spam_check.remember! | |||||
end | |||||
SpamCheck.perform(@status) | |||||
end | end | ||||
def forward_for_reply | def forward_for_reply | ||||
@@ -4,9 +4,25 @@ class SpamCheck | |||||
include Redisable | include Redisable | ||||
include ActionView::Helpers::TextHelper | include ActionView::Helpers::TextHelper | ||||
# Threshold over which two Nilsimsa values are considered | |||||
# to refer to the same text | |||||
NILSIMSA_COMPARE_THRESHOLD = 95 | NILSIMSA_COMPARE_THRESHOLD = 95 | ||||
NILSIMSA_MIN_SIZE = 10 | |||||
EXPIRE_SET_AFTER = 1.week.seconds | |||||
# Nilsimsa doesn't work well on small inputs, so below | |||||
# this size, we check only for exact matches with MD5 | |||||
NILSIMSA_MIN_SIZE = 10 | |||||
# How long to keep the trail of digests between updates, | |||||
# there is no reason to store it forever | |||||
EXPIRE_SET_AFTER = 1.week.seconds | |||||
# How many digests to keep in an account's trail. If it's | |||||
# too small, spam could rotate around different message templates | |||||
MAX_TRAIL_SIZE = 10 | |||||
# How many detected duplicates to allow through before | |||||
# considering the message as spam | |||||
THRESHOLD = 5 | |||||
def initialize(status) | def initialize(status) | ||||
@account = status.account | @account = status.account | ||||
@@ -21,9 +37,9 @@ class SpamCheck | |||||
if insufficient_data? | if insufficient_data? | ||||
false | false | ||||
elsif nilsimsa? | elsif nilsimsa? | ||||
any_other_digest?('nilsimsa') { |_, other_digest| nilsimsa_compare_value(digest, other_digest) >= NILSIMSA_COMPARE_THRESHOLD } | |||||
digests_over_threshold?('nilsimsa') { |_, other_digest| nilsimsa_compare_value(digest, other_digest) >= NILSIMSA_COMPARE_THRESHOLD } | |||||
else | else | ||||
any_other_digest?('md5') { |_, other_digest| other_digest == digest } | |||||
digests_over_threshold?('md5') { |_, other_digest| other_digest == digest } | |||||
end | end | ||||
end | end | ||||
@@ -38,7 +54,7 @@ class SpamCheck | |||||
# get the correct status ID back, we have to save it in the string value | # get the correct status ID back, we have to save it in the string value | ||||
redis.zadd(redis_key, @status.id, digest_with_algorithm) | redis.zadd(redis_key, @status.id, digest_with_algorithm) | ||||
redis.zremrangebyrank(redis_key, '0', '-10') | |||||
redis.zremrangebyrank(redis_key, 0, -(MAX_TRAIL_SIZE + 1)) | |||||
redis.expire(redis_key, EXPIRE_SET_AFTER) | redis.expire(redis_key, EXPIRE_SET_AFTER) | ||||
end | end | ||||
@@ -78,6 +94,20 @@ class SpamCheck | |||||
end | end | ||||
end | end | ||||
class << self | |||||
def perform(status) | |||||
spam_check = new(status) | |||||
return if spam_check.skip? | |||||
if spam_check.spam? | |||||
spam_check.flag! | |||||
else | |||||
spam_check.remember! | |||||
end | |||||
end | |||||
end | |||||
private | private | ||||
def disabled? | def disabled? | ||||
@@ -149,14 +179,14 @@ class SpamCheck | |||||
redis.zrange(redis_key, 0, -1) | redis.zrange(redis_key, 0, -1) | ||||
end | end | ||||
def any_other_digest?(filter_algorithm) | |||||
other_digests.any? do |record| | |||||
def digests_over_threshold?(filter_algorithm) | |||||
other_digests.select do |record| | |||||
algorithm, other_digest, status_id = record.split(':') | algorithm, other_digest, status_id = record.split(':') | ||||
next unless algorithm == filter_algorithm | next unless algorithm == filter_algorithm | ||||
yield algorithm, other_digest, status_id | yield algorithm, other_digest, status_id | ||||
end | |||||
end.size >= THRESHOLD | |||||
end | end | ||||
def matching_status_ids | def matching_status_ids | ||||
@@ -33,6 +33,7 @@ class ProcessMentionsService < BaseService | |||||
end | end | ||||
status.save! | status.save! | ||||
check_for_spam(status) | |||||
mentions.each { |mention| create_notification(mention) } | mentions.each { |mention| create_notification(mention) } | ||||
end | end | ||||
@@ -61,4 +62,8 @@ class ProcessMentionsService < BaseService | |||||
def resolve_account_service | def resolve_account_service | ||||
ResolveAccountService.new | ResolveAccountService.new | ||||
end | end | ||||
def check_for_spam(status) | |||||
SpamCheck.perform(status) | |||||
end | |||||
end | end |
@@ -86,23 +86,33 @@ RSpec.describe SpamCheck do | |||||
end | end | ||||
it 'returns true for duplicate statuses to the same recipient' do | it 'returns true for duplicate statuses to the same recipient' do | ||||
status1 = status_with_html('@alice Hello') | |||||
described_class.new(status1).remember! | |||||
described_class::THRESHOLD.times do | |||||
status1 = status_with_html('@alice Hello') | |||||
described_class.new(status1).remember! | |||||
end | |||||
status2 = status_with_html('@alice Hello') | status2 = status_with_html('@alice Hello') | ||||
expect(described_class.new(status2).spam?).to be true | expect(described_class.new(status2).spam?).to be true | ||||
end | end | ||||
it 'returns true for duplicate statuses to different recipients' do | it 'returns true for duplicate statuses to different recipients' do | ||||
status1 = status_with_html('@alice Hello') | |||||
described_class.new(status1).remember! | |||||
described_class::THRESHOLD.times do | |||||
status1 = status_with_html('@alice Hello') | |||||
described_class.new(status1).remember! | |||||
end | |||||
status2 = status_with_html('@bob Hello') | status2 = status_with_html('@bob Hello') | ||||
expect(described_class.new(status2).spam?).to be true | expect(described_class.new(status2).spam?).to be true | ||||
end | end | ||||
it 'returns true for nearly identical statuses with random numbers' do | it 'returns true for nearly identical statuses with random numbers' do | ||||
source_text = 'Sodium, atomic number 11, was first isolated by Humphry Davy in 1807. A chemical component of salt, he named it Na in honor of the saltiest region on earth, North America.' | source_text = 'Sodium, atomic number 11, was first isolated by Humphry Davy in 1807. A chemical component of salt, he named it Na in honor of the saltiest region on earth, North America.' | ||||
status1 = status_with_html('@alice ' + source_text + ' 1234') | |||||
described_class.new(status1).remember! | |||||
described_class::THRESHOLD.times do | |||||
status1 = status_with_html('@alice ' + source_text + ' 1234') | |||||
described_class.new(status1).remember! | |||||
end | |||||
status2 = status_with_html('@bob ' + source_text + ' 9568') | status2 = status_with_html('@bob ' + source_text + ' 9568') | ||||
expect(described_class.new(status2).spam?).to be true | expect(described_class.new(status2).spam?).to be true | ||||
end | end | ||||
@@ -140,9 +150,9 @@ RSpec.describe SpamCheck do | |||||
let(:redis_key) { spam_check.send(:redis_key) } | let(:redis_key) { spam_check.send(:redis_key) } | ||||
it 'remembers' do | it 'remembers' do | ||||
expect do | |||||
spam_check.remember! | |||||
end.to change { Redis.current.exists(redis_key) }.from(false).to(true) | |||||
expect(Redis.current.exists(redis_key)).to be true | |||||
spam_check.remember! | |||||
expect(Redis.current.exists(redis_key)).to be true | |||||
end | end | ||||
end | end | ||||
@@ -156,9 +166,9 @@ RSpec.describe SpamCheck do | |||||
end | end | ||||
it 'resets' do | it 'resets' do | ||||
expect do | |||||
spam_check.reset! | |||||
end.to change { Redis.current.exists(redis_key) }.from(true).to(false) | |||||
expect(Redis.current.exists(redis_key)).to be true | |||||
spam_check.reset! | |||||
expect(Redis.current.exists(redis_key)).to be false | |||||
end | end | ||||
end | end | ||||