Search: Implement exact matching. (#6895)

* Search: Implement exact matching.

Fixes #6837

This re-uses the existing `name_sort` field, a non-analyzed version of
`name`, which is what exact matching needs.

* Fix flake8

* Add another test for exact matching

* Add test that tests for description hijack

* Use new name.raw field

* Remove l10n test for now, it only works accidentally.

* Fix comment

* Fix tests
This commit is contained in:
Christopher Grebs 2017-11-13 18:54:02 +01:00 коммит произвёл GitHub
Родитель 0af406cbdc
Коммит b8749341b3
Не найден ключ, соответствующий данной подписи
Идентификатор ключа GPG: 4AEE18F83AFDEB23
5 изменённых файлов: 96 добавлений и 19 удалений

Просмотреть файл

@ -22,7 +22,8 @@ class AddonIndexer(BaseSearchIndexer):
"""Fields we don't need to expose in the results, only used for filtering """Fields we don't need to expose in the results, only used for filtering
or sorting.""" or sorting."""
hidden_fields = ( hidden_fields = (
'name_sort', '*.raw',
'*_sort',
'boost', 'boost',
'hotness', 'hotness',
# Translated content that is used for filtering purposes is stored # Translated content that is used for filtering purposes is stored
@ -140,11 +141,17 @@ class AddonIndexer(BaseSearchIndexer):
}, },
}, },
'modified': {'type': 'date', 'index': False}, 'modified': {'type': 'date', 'index': False},
# Adding word-delimiter to split on camelcase and 'name': {
# punctuation. 'type': 'text',
'name': {'type': 'text', # Adding word-delimiter to split on camelcase and
'analyzer': 'standardPlusWordDelimiter'}, # punctuation.
# Turn off analysis on name so we can sort by it. 'analyzer': 'standardPlusWordDelimiter',
'fields': {
# Turn off analysis on name so we can sort by it.
'raw': {'type': 'keyword'}
},
},
# TODO: Can be removed once we have `name.raw` indexed
'name_sort': {'type': 'keyword'}, 'name_sort': {'type': 'keyword'},
'persona': { 'persona': {
'type': 'object', 'type': 'object',

Просмотреть файл

@ -2218,7 +2218,7 @@ class TestAddonSearchView(ESTestCase):
qset = AddonSearchView().get_queryset() qset = AddonSearchView().get_queryset()
assert set(qset.to_dict()['_source']['excludes']) == set( assert set(qset.to_dict()['_source']['excludes']) == set(
('name_sort', 'boost', 'hotness', 'name', 'description', ('*.raw', '*_sort', 'boost', 'hotness', 'name', 'description',
'name_l10n_*', 'description_l10n_*', 'summary', 'summary_l10n_*') 'name_l10n_*', 'description_l10n_*', 'summary', 'summary_l10n_*')
) )

Просмотреть файл

@ -272,9 +272,20 @@ class SearchQueryFilter(BaseFilterBackend):
# Apply rules to search on few base fields. Some might not be present # Apply rules to search on few base fields. Some might not be present
# in every document type / indexes. # in every document type / indexes.
for k, v in rules: for query_cls, opts in rules:
for field in ('name', 'slug', 'listed_authors.name'): for field in ('name', 'slug', 'listed_authors.name'):
should.append(k(**{field: v})) should.append(query_cls(**{field: opts}))
# Exact matches need to be queried against a non-analyzed field. Let's
# do a term query on `name.raw` for an exact match against the add-on
# name and boost it since this is likely what the user wants.
# Use a super-high boost to avoid `description` or `summary`
# getting in our way.
should.append(query.Term(**{
'name.raw': {
'value': search_query, 'boost': 100
}
}))
# For name, also search in translated field with the right language # For name, also search in translated field with the right language
# and analyzer. # and analyzer.

Просмотреть файл

@ -134,6 +134,20 @@ class TestQueryFilter(FilterTestsBase):
]}} ]}}
} }
def test_q_exact(self):
    """The ``q`` filter adds a boosted ``term`` clause on ``name.raw``.

    The query is sent as 'Adblock Plus'; the clause is expected to carry
    the lowercased value 'adblock plus' with a boost of 100.
    """
    filtered = self._filter(data={'q': 'Adblock Plus'})
    bool_query = filtered['query']['function_score']['query']['bool']
    should_clauses = bool_query['should']

    exact_name_clause = {
        'term': {
            'name.raw': {'value': u'adblock plus', 'boost': 100},
        },
    }
    assert exact_name_clause in should_clauses
class TestReviewedContentFilter(FilterTestsBase): class TestReviewedContentFilter(FilterTestsBase):

Просмотреть файл

@ -727,29 +727,29 @@ class TestSearchResultScoring(ESTestCase):
self.refresh() self.refresh()
response = self.client.get(self.url, {'q': 'merge windows'}) response = self.client.get(self.url, {'q': 'merge windows'})
result = self.get_results(response) results = self.get_results(response)
# Doesn't match "All Downloader Professional" # Doesn't match "All Downloader Professional"
assert addons[2].pk not in result assert addons[2].pk not in results
# Matches both "Merge Windows" and "Merge All Windows" but can't # Matches both "Merge Windows" and "Merge All Windows" but can't
# correctly predict their exact scoring since we don't have # correctly predict their exact scoring since we don't have
# an exact match that would prefer 'merge windows'. Both should be # an exact match that would prefer 'merge windows'. Both should be
# the first two matches though. # the first two matches though.
assert addons[1].pk in result[:2] assert addons[1].pk in results[:2]
assert addons[0].pk in result[:2] assert addons[0].pk in results[:2]
response = self.client.get(self.url, {'q': 'merge all windows'}) response = self.client.get(self.url, {'q': 'merge all windows'})
result = self.get_results(response) results = self.get_results(response)
# Make sure we match 'All Downloader Professional' but its # Make sure we match 'All Downloader Professional' but its
# term match frequency is much lower than the other two so it's # term match frequency is much lower than the other two so it's
# last. # last.
assert addons[2].pk == result[-1] assert addons[2].pk == results[-1]
# Other two are first rank again. # Other two are first rank again.
assert addons[1].pk in result[:2] assert addons[1].pk in results[:2]
assert addons[0].pk in result[:2] assert addons[0].pk in results[:2]
def test_score_boost_name_match_slop(self): def test_score_boost_name_match_slop(self):
addon = amo.tests.addon_factory( addon = amo.tests.addon_factory(
@ -760,9 +760,54 @@ class TestSearchResultScoring(ESTestCase):
# direct match # direct match
response = self.client.get(self.url, {'q': 'merge windows'}) response = self.client.get(self.url, {'q': 'merge windows'})
result = self.get_results(response) results = self.get_results(response)
assert result[0] == addon.pk assert results[0] == addon.pk
def test_score_boost_exact_match(self):
    """An add-on whose name equals the query exactly is ranked first.

    Three extensions with near-identical names and zero usage numbers are
    created; searching for the middle one's full name must return it as
    the first result.
    """
    names = ('test addon test11', 'test addon test21', 'test addon test31')
    created = [
        amo.tests.addon_factory(
            name=name, type=amo.ADDON_EXTENSION,
            average_daily_users=0, weekly_downloads=0)
        for name in names
    ]
    wanted = created[1]

    self.refresh()

    response = self.client.get(self.url, {'q': 'test addon test21'})
    ranked = self.get_results(response)
    assert ranked[0] == wanted.pk
def test_score_boost_exact_match_description_hijack(self):
    """Exact name match beats a popular add-on matching via description.

    The first add-on has far higher usage numbers and a description that
    repeats words from the query; the second has almost no usage but its
    name is exactly the search query. The second one must come first.
    """
    # Popular add-on whose description repeats "click" and "1-Click".
    amo.tests.addon_factory(
        name='1-Click YouTube Video Download',
        type=amo.ADDON_EXTENSION,
        average_daily_users=566337, weekly_downloads=150000,
        description=(
            'button, click that button, 1-Click Youtube Video '
            'Downloader is a click click great tool'))
    # Barely-used add-on whose name is exactly the query below.
    exact_match = amo.tests.addon_factory(
        name='Amazon 1-Click Lock', type=amo.ADDON_EXTENSION,
        average_daily_users=50, weekly_downloads=0)

    self.refresh()

    params = {'q': 'Amazon 1-Click Lock'}
    ranked = self.get_results(self.client.get(self.url, params))
    assert ranked[0] == exact_match.pk
class TestPersonaSearch(SearchBase): class TestPersonaSearch(SearchBase):