From b8749341b3bb09b958908d025b35c4df982ff50c Mon Sep 17 00:00:00 2001 From: Christopher Grebs Date: Mon, 13 Nov 2017 18:54:02 +0100 Subject: [PATCH] Search: Implement exact matching. (#6895) * Search: Implement exact matching. Fixes #6837 This re-uses the existing `name_sort` field that is a not analyzed version of `name` which is needed for exact matches. * Fix flake8 * Add another test for exact matching * Add test that tests for description hijack * Use new name.raw field * Remove l10n test for now, it only works accidentally. * Fix comment * Fix tests --- src/olympia/addons/indexers.py | 19 ++++--- src/olympia/addons/tests/test_views.py | 2 +- src/olympia/search/filters.py | 15 +++++- src/olympia/search/tests/test_filters.py | 14 +++++ src/olympia/search/tests/test_views.py | 65 ++++++++++++++++++++---- 5 files changed, 96 insertions(+), 19 deletions(-) diff --git a/src/olympia/addons/indexers.py b/src/olympia/addons/indexers.py index 645c7bdce0..744123f468 100644 --- a/src/olympia/addons/indexers.py +++ b/src/olympia/addons/indexers.py @@ -22,7 +22,8 @@ class AddonIndexer(BaseSearchIndexer): """Fields we don't need to expose in the results, only used for filtering or sorting.""" hidden_fields = ( - 'name_sort', + '*.raw', + '*_sort', 'boost', 'hotness', # Translated content that is used for filtering purposes is stored @@ -140,11 +141,17 @@ class AddonIndexer(BaseSearchIndexer): }, }, 'modified': {'type': 'date', 'index': False}, - # Adding word-delimiter to split on camelcase and - # punctuation. - 'name': {'type': 'text', - 'analyzer': 'standardPlusWordDelimiter'}, - # Turn off analysis on name so we can sort by it. + 'name': { + 'type': 'text', + # Adding word-delimiter to split on camelcase and + # punctuation. + 'analyzer': 'standardPlusWordDelimiter', + 'fields': { + # Turn off analysis on name so we can sort by it. + 'raw': {'type': 'keyword'} + }, + }, + # TODO: Can be removed once we have `name.raw` indexed 'name_sort': {'type': 'keyword'}, 'persona': { 'type': 'object', diff --git a/src/olympia/addons/tests/test_views.py b/src/olympia/addons/tests/test_views.py index 76f75b34e7..f872f90d61 100644 --- a/src/olympia/addons/tests/test_views.py +++ b/src/olympia/addons/tests/test_views.py @@ -2218,7 +2218,7 @@ class TestAddonSearchView(ESTestCase): qset = AddonSearchView().get_queryset() assert set(qset.to_dict()['_source']['excludes']) == set( - ('name_sort', 'boost', 'hotness', 'name', 'description', + ('*.raw', '*_sort', 'boost', 'hotness', 'name', 'description', 'name_l10n_*', 'description_l10n_*', 'summary', 'summary_l10n_*') ) diff --git a/src/olympia/search/filters.py b/src/olympia/search/filters.py index 7e2ed343e0..20ff7019e4 100644 --- a/src/olympia/search/filters.py +++ b/src/olympia/search/filters.py @@ -272,9 +272,20 @@ class SearchQueryFilter(BaseFilterBackend): # Apply rules to search on few base fields. Some might not be present # in every document type / indexes. - for k, v in rules: + for query_cls, opts in rules: for field in ('name', 'slug', 'listed_authors.name'): - should.append(k(**{field: v})) + should.append(query_cls(**{field: opts})) + + # Exact matches need to be queried against a non-analyzed field. Let's + # do a term query on `name.raw` for an exact match against the add-on + # name and boost it since this is likely what the user wants. + # Use a super-high boost to avoid `description` or `summary` + # getting in our way. + should.append(query.Term(**{ + 'name.raw': { + 'value': search_query, 'boost': 100 + } + })) # For name, also search in translated field with the right language # and analyzer. diff --git a/src/olympia/search/tests/test_filters.py b/src/olympia/search/tests/test_filters.py index abf0a5d4bc..40d4c57ec7 100644 --- a/src/olympia/search/tests/test_filters.py +++ b/src/olympia/search/tests/test_filters.py @@ -134,6 +134,20 @@ class TestQueryFilter(FilterTestsBase): ]}} } + def test_q_exact(self): + qs = self._filter(data={'q': 'Adblock Plus'}) + should = qs['query']['function_score']['query']['bool']['should'] + + expected = { + 'term': { + 'name.raw': { + 'boost': 100, 'value': u'adblock plus', + } + } + } + + assert expected in should + class TestReviewedContentFilter(FilterTestsBase): diff --git a/src/olympia/search/tests/test_views.py b/src/olympia/search/tests/test_views.py index 40bfd1328c..d1648514de 100644 --- a/src/olympia/search/tests/test_views.py +++ b/src/olympia/search/tests/test_views.py @@ -727,29 +727,29 @@ class TestSearchResultScoring(ESTestCase): self.refresh() response = self.client.get(self.url, {'q': 'merge windows'}) - result = self.get_results(response) + results = self.get_results(response) # Doesn't match "All Downloader Professional" - assert addons[2].pk not in result + assert addons[2].pk not in results # Matches both "Merge Windows" and "Merge All Windows" but can't # correctly predict their exact scoring since we don't have # an exact match that would prefer 'merge windows'. Both should be # the first two matches though. - assert addons[1].pk in result[:2] - assert addons[0].pk in result[:2] + assert addons[1].pk in results[:2] + assert addons[0].pk in results[:2] response = self.client.get(self.url, {'q': 'merge all windows'}) - result = self.get_results(response) + results = self.get_results(response) # Make sure we match 'All Downloader Professional' but it's # term match frequency is much lower than the other two so it's # last. - assert addons[2].pk == result[-1] + assert addons[2].pk == results[-1] # Other two are first rank again. - assert addons[1].pk in result[:2] - assert addons[0].pk in result[:2] + assert addons[1].pk in results[:2] + assert addons[0].pk in results[:2] def test_score_boost_name_match_slop(self): addon = amo.tests.addon_factory( @@ -760,9 +760,54 @@ class TestSearchResultScoring(ESTestCase): # direct match response = self.client.get(self.url, {'q': 'merge windows'}) - result = self.get_results(response) + results = self.get_results(response) - assert result[0] == addon.pk + assert results[0] == addon.pk + + def test_score_boost_exact_match(self): + """Test that we rank exact matches at the top.""" + addons = [ + amo.tests.addon_factory( + name='test addon test11', type=amo.ADDON_EXTENSION, + average_daily_users=0, weekly_downloads=0), + amo.tests.addon_factory( + name='test addon test21', type=amo.ADDON_EXTENSION, + average_daily_users=0, weekly_downloads=0), + amo.tests.addon_factory( + name='test addon test31', type=amo.ADDON_EXTENSION, + average_daily_users=0, weekly_downloads=0), + ] + + self.refresh() + + response = self.client.get(self.url, {'q': 'test addon test21'}) + results = self.get_results(response) + + assert results[0] == addons[1].pk + + def test_score_boost_exact_match_description_hijack(self): + """Test that we rank exact matches at the top.""" + addons = [ + amo.tests.addon_factory( + name='1-Click YouTube Video Download', + type=amo.ADDON_EXTENSION, + average_daily_users=566337, weekly_downloads=150000, + description=( + 'button, click that button, 1-Click Youtube Video ' + 'Downloader is a click click great tool')), + amo.tests.addon_factory( + name='Amazon 1-Click Lock', type=amo.ADDON_EXTENSION, + average_daily_users=50, weekly_downloads=0), + ] + + self.refresh() + + response = self.client.get(self.url, { + 'q': 'Amazon 1-Click Lock' + }) + results = self.get_results(response) + + assert results[0] == addons[1].pk class TestPersonaSearch(SearchBase):