Browse Source

Add more accurate account search (#11537)

* Add more accurate account search

When ElasticSearch is available, a more accurate search is implemented:

- Using edge n-gram index for acct and display name
- Using asciifolding and cjk width normalization on display names
- Using Gaussian decay on account activity for additional scoring (recency)
- Using followers/friends ratio for additional scoring (spamminess)
- Using followers number for additional scoring (size)

The exact match precedence only takes effect when the input conforms
to the username format and the username part of it is complete, i.e.
when the user started typing the domain part.

* Support single-letter usernames

* Fix tests

* Fix not picking up account updates

* Add weights and normalization for scores, skip zero terms queries

* Use local counts for accounts index, adjust search parameters

* Fix mistakes

* Using updated_at of accounts is inadequate for remote accounts
master^2
Eugen Rochko 4 years ago
committed by GitHub
parent
commit
8fdff2748f
No known key found for this signature in database GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 194 additions and 161 deletions
  1. +36
    -0
      app/chewy/accounts_index.rb
  2. +6
    -0
      app/models/account.rb
  3. +2
    -0
      app/models/account_stat.rb
  4. +125
    -64
      app/services/account_search_service.rb
  5. +25
    -97
      spec/services/account_search_service_spec.rb

+ 36
- 0
app/chewy/accounts_index.rb View File

@@ -0,0 +1,36 @@
# frozen_string_literal: true

class AccountsIndex < Chewy::Index
settings index: { refresh_interval: '5m' }, analysis: {
analyzer: {
content: {
tokenizer: 'whitespace',
filter: %w(lowercase asciifolding cjk_width),
},

edge_ngram: {
tokenizer: 'edge_ngram',
filter: %w(lowercase asciifolding cjk_width),
},
},

tokenizer: {
edge_ngram: {
type: 'edge_ngram',
min_gram: 1,
max_gram: 15,
},
},
}

define_type ::Account.searchable.includes(:account_stat), delete_if: ->(account) { account.destroyed? || !account.searchable? } do
root date_detection: false do
field :id, type: 'long'
field :display_name, type: 'text', analyzer: 'edge_ngram', search_analyzer: 'content'
field :acct, type: 'text', analyzer: 'edge_ngram', search_analyzer: 'content', value: ->(account) { [account.username, account.domain].compact.join('@') }
field :following_count, type: 'long', value: ->(account) { account.active_relationships.count }
field :followers_count, type: 'long', value: ->(account) { account.passive_relationships.count }
field :last_status_at, type: 'date', value: ->(account) { account.last_status_at || account.created_at }
end
end
end

+ 6
- 0
app/models/account.rb View File

@@ -127,6 +127,8 @@ class Account < ApplicationRecord

delegate :chosen_languages, to: :user, prefix: false, allow_nil: true

update_index('accounts#account', :self) if Chewy.enabled?

def local?
domain.nil?
end
@@ -169,6 +171,10 @@ class Account < ApplicationRecord
subscription_expires_at.present?
end

def searchable?
!(suspended? || moved?)
end

def possibly_stale?
last_webfingered_at.nil? || last_webfingered_at <= 1.day.ago
end


+ 2
- 0
app/models/account_stat.rb View File

@@ -16,6 +16,8 @@
class AccountStat < ApplicationRecord
belongs_to :account, inverse_of: :account_stat

update_index('accounts#account', :account) if Chewy.enabled?

def increment_count!(key)
update(attributes_for_increment(key))
end


+ 125
- 64
app/services/account_search_service.rb View File

@@ -4,105 +4,134 @@ class AccountSearchService < BaseService
attr_reader :query, :limit, :offset, :options, :account

def call(query, account = nil, options = {})
@query = query.strip
@limit = options[:limit].to_i
@offset = options[:offset].to_i
@options = options
@account = account
@acct_hint = query.start_with?('@')
@query = query.strip.gsub(/\A@/, '')
@limit = options[:limit].to_i
@offset = options[:offset].to_i
@options = options
@account = account

search_service_results
search_service_results.compact.uniq
end

private

def search_service_results
return [] if query_blank_or_hashtag? || limit < 1
return [] if query.blank? || limit < 1

if resolving_non_matching_remote_account?
[ResolveAccountService.new.call("#{query_username}@#{query_domain}")].compact
else
search_results_and_exact_match.compact.uniq
end
[exact_match] + search_results
end

def resolving_non_matching_remote_account?
offset.zero? && options[:resolve] && !exact_match? && !domain_is_local?
end
def exact_match
return unless offset.zero? && username_complete?

def search_results_and_exact_match
return search_results.to_a unless offset.zero?
return @exact_match if defined?(@exact_match)

results = [exact_match]
@exact_match = begin
if options[:resolve]
ResolveAccountService.new.call(query)
elsif domain_is_local?
Account.find_local(query_username)
else
Account.find_remote(query_username, query_domain)
end
end
end

return results if exact_match? && limit == 1
def search_results
return [] if limit_for_non_exact_results.zero?

results + search_results.to_a
@search_results ||= begin
if Chewy.enabled?
from_elasticsearch
else
from_database
end
end
end

def query_blank_or_hashtag?
query.blank? || query.start_with?('#')
def from_database
if account
advanced_search_results
else
simple_search_results
end
end

def split_query_string
@split_query_string ||= query.gsub(/\A@/, '').split('@')
def advanced_search_results
Account.advanced_search_for(terms_for_query, account, limit_for_non_exact_results, options[:following], offset)
end

def query_username
@query_username ||= split_query_string.first || ''
def simple_search_results
Account.search_for(terms_for_query, limit_for_non_exact_results, offset)
end

def query_domain
@query_domain ||= query_without_split? ? nil : split_query_string.last
end
def from_elasticsearch
must_clauses = [{ multi_match: { query: terms_for_query, fields: likely_acct? ? %w(acct) : %w(acct^2 display_name), type: 'best_fields' } }]
should_clauses = []

def query_without_split?
split_query_string.size == 1
end
if account
return [] if options[:following] && following_ids.empty?

def domain_is_local?
@domain_is_local ||= TagManager.instance.local_domain?(query_domain)
end
if options[:following]
must_clauses << { terms: { id: following_ids } }
elsif following_ids.any?
should_clauses << { terms: { id: following_ids, boost: 100 } }
end
end

def search_from
options[:following] && account ? account.following : Account
end
query = { bool: { must: must_clauses, should: should_clauses } }
functions = [reputation_score_function, followers_score_function, time_distance_function]

def exact_match?
exact_match.present?
end
records = AccountsIndex.query(function_score: { query: query, functions: functions, boost_mode: 'multiply', score_mode: 'avg' })
.limit(limit_for_non_exact_results)
.offset(offset)
.objects
.compact

def exact_match
return @exact_match if defined?(@exact_match)
ActiveRecord::Associations::Preloader.new.preload(records, :account_stat)

@exact_match = begin
if domain_is_local?
search_from.without_suspended.find_local(query_username)
else
search_from.without_suspended.find_remote(query_username, query_domain)
end
end
records
end

def search_results
@search_results ||= begin
if account
advanced_search_results
else
simple_search_results
end
end
def reputation_score_function
{
script_score: {
script: {
source: "(doc['followers_count'].value + 0.0) / (doc['followers_count'].value + doc['following_count'].value + 1)",
},
},
}
end

def advanced_search_results
Account.advanced_search_for(terms_for_query, account, limit_for_non_exact_results, options[:following], offset)
def followers_score_function
{
field_value_factor: {
field: 'followers_count',
modifier: 'log2p',
missing: 1,
},
}
end

def simple_search_results
Account.search_for(terms_for_query, limit_for_non_exact_results, offset)
def time_distance_function
{
gauss: {
last_status_at: {
scale: '30d',
offset: '30d',
decay: 0.3,
},
},
}
end

def following_ids
@following_ids ||= account.active_relationships.pluck(:target_account_id)
end

def limit_for_non_exact_results
if offset.zero? && exact_match?
if exact_match?
limit - 1
else
limit
@@ -113,7 +142,39 @@ class AccountSearchService < BaseService
if domain_is_local?
query_username
else
"#{query_username} #{query_domain}"
query
end
end

def split_query_string
@split_query_string ||= query.split('@')
end

def query_username
@query_username ||= split_query_string.first || ''
end

def query_domain
@query_domain ||= query_without_split? ? nil : split_query_string.last
end

def query_without_split?
split_query_string.size == 1
end

def domain_is_local?
@domain_is_local ||= TagManager.instance.local_domain?(query_domain)
end

def exact_match?
exact_match.present?
end

def username_complete?
query.include?('@') && "@#{query}" =~ Account::MENTION_RE
end

def likely_acct?
@acct_hint || username_complete?
end
end

+ 25
- 97
spec/services/account_search_service_spec.rb View File

@@ -1,126 +1,56 @@
require 'rails_helper'

describe AccountSearchService, type: :service do
describe '.call' do
describe 'with a query to ignore' do
describe '#call' do
context 'with a query to ignore' do
it 'returns empty array for missing query' do
results = subject.call('', nil, limit: 10)

expect(results).to eq []
end
it 'returns empty array for hashtag query' do
results = subject.call('#tag', nil, limit: 10)

expect(results).to eq []
end
it 'returns empty array for limit zero' do
Fabricate(:account, username: 'match')

results = subject.call('match', nil, limit: 0)

expect(results).to eq []
end
end

describe 'searching for a simple term that is not an exact match' do
context 'searching for a simple term that is not an exact match' do
it 'does not return a nil entry in the array for the exact match' do
match = Fabricate(:account, username: 'matchingusername')

account = Fabricate(:account, username: 'matchingusername')
results = subject.call('match', nil, limit: 5)
expect(results).to eq [match]
end
end

describe 'searching local and remote users' do
describe "when only '@'" do
before do
allow(Account).to receive(:find_local)
allow(Account).to receive(:search_for)
subject.call('@', nil, limit: 10)
end

it 'uses find_local with empty query to look for local accounts' do
expect(Account).to have_received(:find_local).with('')
end
end

describe 'when no domain' do
before do
allow(Account).to receive(:find_local)
allow(Account).to receive(:search_for)
subject.call('one', nil, limit: 10)
end

it 'uses find_local to look for local accounts' do
expect(Account).to have_received(:find_local).with('one')
end

it 'uses search_for to find matches' do
expect(Account).to have_received(:search_for).with('one', 10, 0)
end
end

describe 'when there is a domain' do
before do
allow(Account).to receive(:find_remote)
end

it 'uses find_remote to look for remote accounts' do
subject.call('two@example.com', nil, limit: 10)
expect(Account).to have_received(:find_remote).with('two', 'example.com')
end

describe 'and there is no account provided' do
it 'uses search_for to find matches' do
allow(Account).to receive(:search_for)
subject.call('two@example.com', nil, limit: 10, resolve: false)

expect(Account).to have_received(:search_for).with('two example.com', 10, 0)
end
end

describe 'and there is an account provided' do
it 'uses advanced_search_for to find matches' do
account = Fabricate(:account)
allow(Account).to receive(:advanced_search_for)
subject.call('two@example.com', account, limit: 10, resolve: false)

expect(Account).to have_received(:advanced_search_for).with('two example.com', account, 10, nil, 0)
end
end
expect(results).to eq [account]
end
end

describe 'with an exact match' do
it 'returns exact match first, and does not return duplicates' do
partial = Fabricate(:account, username: 'exactness')
exact = Fabricate(:account, username: 'exact')

results = subject.call('exact', nil, limit: 10)
expect(results.size).to eq 2
expect(results).to eq [exact, partial]
end
end

describe 'when there is a local domain' do
context 'when there is a local domain' do
around do |example|
before = Rails.configuration.x.local_domain

example.run

Rails.configuration.x.local_domain = before
end

it 'returns exact match first' do
remote = Fabricate(:account, username: 'a', domain: 'remote', display_name: 'e')
remote_too = Fabricate(:account, username: 'b', domain: 'remote', display_name: 'e')
exact = Fabricate(:account, username: 'e')
exact = Fabricate(:account, username: 'e')

Rails.configuration.x.local_domain = 'example.com'

results = subject.call('e@example.com', nil, limit: 2)

expect(results.size).to eq 2
expect(results).to eq([exact, remote]).or eq([exact, remote_too])
end
end

describe 'when there is a domain but no exact match' do
context 'when there is a domain but no exact match' do
it 'follows the remote account when resolve is true' do
service = double(call: nil)
allow(ResolveAccountService).to receive(:new).and_return(service)
@@ -138,23 +68,21 @@ describe AccountSearchService, type: :service do
end
end

describe 'should not include suspended accounts' do
it 'returns the fuzzy match first, and does not return suspended exacts' do
partial = Fabricate(:account, username: 'exactness')
exact = Fabricate(:account, username: 'exact', suspended: true)
it 'returns the fuzzy match first, and does not return suspended exacts' do
partial = Fabricate(:account, username: 'exactness')
exact = Fabricate(:account, username: 'exact', suspended: true)
results = subject.call('exact', nil, limit: 10)

results = subject.call('exact', nil, limit: 10)
expect(results.size).to eq 1
expect(results).to eq [partial]
end
expect(results.size).to eq 1
expect(results).to eq [partial]
end

it "does not return suspended remote accounts" do
remote = Fabricate(:account, username: 'a', domain: 'remote', display_name: 'e', suspended: true)
it "does not return suspended remote accounts" do
remote = Fabricate(:account, username: 'a', domain: 'remote', display_name: 'e', suspended: true)
results = subject.call('a@example.com', nil, limit: 2)

results = subject.call('a@example.com', nil, limit: 2)
expect(results.size).to eq 0
expect(results).to eq []
end
expect(results.size).to eq 0
expect(results).to eq []
end
end
end

Loading…
Cancel
Save