Weight the B section of the search document the same as the A section.

This will give the synopsis and the path equal weight, which will improve
search results.

This CL is just preparation for that. It won't take effect until
we change the code to call the popular_search function instead of
popular_search_go_mod.

Change-Id: Ic5ca03ef1296a4852c80d9aa1c145644acd7dd9c
Reviewed-on: https://team-review.git.corp.google.com/c/golang/discovery/+/705158
Reviewed-by: Robert Findley <rfindley@google.com>
Reviewed-by: Julie Qiu <julieqiu@google.com>
This commit is contained in:
Jonathan Amsterdam 2020-03-30 17:15:08 -04:00 коммит произвёл Julie Qiu
Родитель 16528322e9
Коммит f55f7d8cba
2 изменённых файлов: 147 добавлений и 0 удалений

Просмотреть файл

@ -0,0 +1,83 @@
-- Copyright 2020 The Go Authors. All rights reserved.
-- Use of this source code is governed by a BSD-style
-- license that can be found in the LICENSE file.
BEGIN;
-- popular_search(rawquery, lim, off) returns one page of search results:
-- it skips the first `off` results and returns at most `lim` of them, ordered
-- by score descending, then commit_time descending, then package_path
-- ascending. Results whose score is not above 0.1 are dropped from the page.
--
-- Documents are visited from most to least imported, and the scan stops as
-- soon as no remaining document could displace the current lowest-ranked
-- result.
CREATE OR REPLACE FUNCTION popular_search(rawquery text, lim integer, off integer) RETURNS SETOF search_result
    LANGUAGE plpgsql
    AS $$
DECLARE
    -- docs yields every search document together with its score for the
    -- parsed query tsq, most-imported documents first.
    docs CURSOR(tsq TSQUERY) FOR
        SELECT
            package_path,
            module_path,
            version,
            commit_time,
            imported_by_count,
            (
                ts_rank(tsv_search_tokens, tsq) *
                ln(exp(1) + imported_by_count) *
                -- Non-redistributable packages count for half.
                CASE WHEN redistributable THEN 1 ELSE 0.5 END *
                -- Documents that do not match the query get their score
                -- zeroed here instead of being filtered out in a WHERE
                -- clause: with the filter in WHERE the planner ended up
                -- choosing the tsv_search_tokens GIN index over the
                -- popularity index, which defeats the purpose of this
                -- function. The ts_rank factor alone is not enough, because
                -- ts_rank can be nonzero while `tsv_search_tokens @@ tsq` is
                -- false (apparently ts_rank has no special handling for
                -- AND/OR conjunctions).
                CASE WHEN tsv_search_tokens @@ tsq THEN 1 ELSE 0 END
            ) AS score
        FROM search_documents
        -- Intended to be served by the popularity index.
        ORDER BY imported_by_count DESC;

    -- best holds the leading results found so far, kept in final sort order.
    best search_result[];
    -- cand is the document most recently fetched from the cursor.
    cand search_result;
    -- capacity is the number of slots in best, i.e. the index of its last one.
    capacity INT;
BEGIN
    capacity := lim + off;
    best := array_fill(NULL::search_result, ARRAY[capacity]);

    OPEN docs(tsq := websearch_to_tsquery(rawquery));
    LOOP
        FETCH docs INTO cand;
        EXIT WHEN NOT found;

        -- Only bother looking for a slot when there is still a free one or
        -- the candidate scores at least as high as the current last entry.
        IF best[capacity] IS NULL OR cand.score >= best[capacity].score THEN
            FOR slot IN 1..capacity LOOP
                -- Place the candidate in front of the first entry it sorts
                -- before (score desc, commit_time desc, package_path asc),
                -- or into the first empty slot.
                IF best[slot] IS NULL
                   OR cand.score > best[slot].score
                   OR (cand.score = best[slot].score
                       AND (cand.commit_time > best[slot].commit_time
                            OR (cand.commit_time = best[slot].commit_time
                                AND cand.package_path < best[slot].package_path)))
                THEN
                    -- Shift the tail right by one, dropping the last entry.
                    best := (best[1:slot-1] || cand) || best[slot:capacity-1];
                    EXIT;
                END IF;
            END LOOP;
        END IF;

        -- Every later document has an import count no larger than this one,
        -- and the scoring above relies on ts_rank never exceeding 1.0, so a
        -- later score is at most ln(e + imported_by_count). Once the
        -- lowest-ranked kept result beats that ceiling, nothing further can
        -- enter the list and the scan can stop.
        EXIT WHEN best[capacity].score > ln(exp(1) + cand.imported_by_count);
    END LOOP;
    CLOSE docs;

    -- Return the requested page, skipping unfilled slots and weak matches.
    RETURN QUERY
        SELECT *
        FROM UNNEST(best[off+1:capacity])
        WHERE package_path IS NOT NULL
          AND score > 0.1;
END; $$;
-- Attach a description to the three-argument popular_search overload.
COMMENT ON FUNCTION popular_search(rawquery text, lim integer, off integer) IS
'FUNCTION popular_search is used to generate results for search. It is implemented as a stored function, so that we can use a cursor to scan search documents procedurally, and stop scanning early, whenever our search results are provably correct.';
-- END is PostgreSQL's synonym for COMMIT; it closes the transaction opened by
-- the BEGIN at the top of this script.
END;

Просмотреть файл

@ -0,0 +1,64 @@
-- Copyright 2020 The Go Authors. All rights reserved.
-- Use of this source code is governed by a BSD-style
-- license that can be found in the LICENSE file.
BEGIN;
-- Redefine the popular_search function, which is currently unused,
-- to be the same as popular_search_go_mod but with a B weight of 1.
-- popular_search(rawquery, lim, off, redist_factor, go_mod_factor) returns one
-- page of search results: it skips the first `off` results and returns at most
-- `lim` of them, ordered by score descending, then commit_time descending,
-- then package_path ascending. Results whose score is not above 0.1 are
-- dropped from the page.
--
-- Parameters:
--   rawquery       search text, parsed with websearch_to_tsquery.
--   lim, off       page size and number of leading results to skip.
--   redist_factor  score multiplier applied to non-redistributable packages.
--   go_mod_factor  score multiplier applied to packages whose has_go_mod is
--                  false (a NULL has_go_mod is treated as true).
--
-- Documents are visited from most to least imported, and the scan stops as
-- soon as no remaining document could displace the lowest-ranked result.
CREATE OR REPLACE FUNCTION popular_search(rawquery text, lim integer, off integer, redist_factor real, go_mod_factor real) RETURNS SETOF search_result
    LANGUAGE plpgsql
    AS $$
DECLARE
    cur CURSOR(query TSQUERY) FOR
        SELECT
            package_path,
            module_path,
            version,
            commit_time,
            imported_by_count,
            (
                -- default D, C, B, A weights are {0.1, 0.2, 0.4, 1.0}
                ts_rank('{0.1, 0.2, 1.0, 1.0}', tsv_search_tokens, query) *
                ln(exp(1)+imported_by_count) *
                CASE WHEN redistributable THEN 1 ELSE redist_factor END *
                CASE WHEN COALESCE(has_go_mod, true) THEN 1 ELSE go_mod_factor END *
                -- Zero the score of non-matching documents here rather than
                -- filtering them in a WHERE clause, so that the scan keeps
                -- following the imported_by_count ordering.
                CASE WHEN tsv_search_tokens @@ query THEN 1 ELSE 0 END
            ) score
        FROM search_documents
        ORDER BY imported_by_count DESC;
    -- top is the top search results, sorted by score descending, commit time
    -- descending, then package_path ascending.
    top search_result[];
    -- res is the current search result.
    res search_result;
    -- last_idx is the index of the last element in top.
    last_idx INT;
    -- max_boost is an upper bound on the magnitude of the product of the two
    -- CASE multipliers above. It is exactly 1 unless a caller passes a factor
    -- whose magnitude exceeds 1. GREATEST ignores NULL arguments, so NULL
    -- factors also leave it at 1.
    max_boost DOUBLE PRECISION;
BEGIN
    last_idx := lim+off;
    max_boost := GREATEST(1, abs(redist_factor))::double precision *
                 GREATEST(1, abs(go_mod_factor))::double precision;
    top := array_fill(NULL::search_result, array[last_idx]);
    OPEN cur(query := websearch_to_tsquery(rawquery));
    FETCH cur INTO res;
    WHILE found LOOP
        IF top[last_idx] IS NULL OR res.score >= top[last_idx].score THEN
            -- Insert res into top, maintaining sort order.
            FOR i IN 1..last_idx LOOP
                -- Insert res in front of the first entry it sorts before
                -- (score desc, commit_time desc, package_path asc), or into
                -- the first empty slot.
                IF top[i] IS NULL OR
                    (res.score > top[i].score) OR
                    (res.score = top[i].score AND res.commit_time > top[i].commit_time) OR
                    (res.score = top[i].score AND res.commit_time = top[i].commit_time AND
                        res.package_path < top[i].package_path) THEN
                    top := (top[1:i-1] || res) || top[i:last_idx-1];
                    EXIT;
                END IF;
            END LOOP;
        END IF;
        -- Every subsequent document has an import count no larger than this
        -- one, its ts_rank is at most 1.0 (the largest weight used above is
        -- 1.0), and its combined multiplier is at most max_boost. Its score is
        -- therefore at most max_boost * ln(e + imported_by_count). Once the
        -- lowest-ranked kept result beats that ceiling, we're done. The bound
        -- must include max_boost: without it, factors greater than 1 could
        -- end the scan before a later, boosted document that belongs in top.
        IF top[last_idx].score > max_boost * ln(exp(1)+res.imported_by_count) THEN
            EXIT;
        END IF;
        FETCH cur INTO res;
    END LOOP;
    CLOSE cur;
    -- Return the requested page, skipping unfilled slots and weak matches.
    RETURN QUERY SELECT * FROM UNNEST(top[off+1:last_idx])
        WHERE package_path IS NOT NULL AND score > 0.1;
END; $$;
-- Attach a description to the five-argument popular_search overload.
COMMENT ON FUNCTION popular_search(rawquery text, lim integer, off integer, redist_factor real, go_mod_factor real) IS
'FUNCTION popular_search is used to generate results for search. It is implemented as a stored function, so that we can use a cursor to scan search documents procedurally, and stop scanning early, whenever our search results are provably correct.';
-- END is PostgreSQL's synonym for COMMIT; it closes the transaction opened by
-- the BEGIN at the top of this script.
END;