internal/postgres: optimize search query

The search query is rewritten to:

* use documents.tsv_search_tokens instead of readme_tokens ||
  synopsis_tokens || name_tokens || path_tokens
* query directly from documents, packages, versions, imports and
  vw_package_licenses instead of using vw_search_results
* filter out results where relevance is < 1e-10 (instead of rank)

Based on my local database, which has 7185 packages, 144 versions and returns
800+ results for the search term "cloud", this reduced search time for "cloud"
from ~2.5secs to less than 300ms.

Updates b/130090305

Change-Id: Ida419e0d349b38149bec0e41db53b8bf0be18710
Reviewed-on: https://team-review.git.corp.google.com/c/golang/discovery/+/456457
Reviewed-by: Robert Findley <rfindley@google.com>
This commit is contained in:
Julie Qiu 2019-04-29 13:56:13 -04:00
Родитель 58338d760e
Коммит 1819c48b02
2 изменённых файлов: 56 добавлений и 52 удалений

Просмотреть файл

@ -74,11 +74,15 @@ func prepareAndExec(tx *sql.Tx, query string, stmtFunc func(*sql.Stmt) error) (e
}
defer func() {
if err = stmt.Close(); err != nil {
err = fmt.Errorf("stmt.Close: %v", err)
cerr := stmt.Close()
if err == nil {
err = cerr
}
}()
return stmtFunc(stmt)
if err := stmtFunc(stmt); err != nil {
return fmt.Errorf("stmtFunc(stmt): %v", err)
}
return nil
}
// buildInsertQuery builds an multi-value insert query, following the format:

Просмотреть файл

@ -32,19 +32,16 @@ func (db *DB) InsertDocuments(ctx context.Context, version *internal.Version) er
module_path,
series_path,
version,
name_tokens,
path_tokens,
synopsis_tokens,
readme_tokens
tsv_search_tokens
) VALUES(
$1,
$2,
$3,
$4,
$5,
SETWEIGHT(TO_TSVECTOR($6), 'A'),
SETWEIGHT(TO_TSVECTOR($7), 'A'),
SETWEIGHT(TO_TSVECTOR($8), 'B'),
SETWEIGHT(TO_TSVECTOR($6), 'A') ||
SETWEIGHT(TO_TSVECTOR($7), 'A') ||
SETWEIGHT(TO_TSVECTOR($8), 'B') ||
SETWEIGHT(TO_TSVECTOR($9), 'C')
) ON CONFLICT DO NOTHING;`, func(stmt *sql.Stmt) error {
for _, p := range version.Packages {
@ -82,49 +79,52 @@ func (db *DB) Search(ctx context.Context, terms []string, limit, offset int) ([]
if len(terms) == 0 {
return nil, derrors.InvalidArgument(fmt.Sprintf("cannot search: no terms"))
}
query := `WITH results AS (
SELECT
package_path,
version,
module_path,
name,
synopsis,
license_types,
license_paths,
commit_time,
num_imported_by,
(
ts_rank (
name_tokens ||
path_tokens ||
synopsis_tokens ||
readme_tokens, to_tsquery($1)
) * log(exp(1)+num_imported_by)
) AS rank
FROM
vw_search_results
)
SELECT
r.package_path,
r.version,
r.module_path,
r.name,
r.synopsis,
r.license_types,
r.license_paths,
r.commit_time,
r.num_imported_by,
r.rank,
COUNT(*) OVER() AS total
FROM
results r
WHERE
r.rank > POWER(10,-10)
ORDER BY
r.rank DESC
LIMIT $2
OFFSET $3;`
query := `
WITH imported_by AS (
SELECT to_path, COALESCE(COUNT(*),0) AS num_imported_by
FROM (SELECT to_path, from_path FROM imports GROUP BY 1,2) i
GROUP BY 1
),
docs AS (
SELECT package_path, MAX(relevance) AS relevance
FROM (
SELECT package_path, version,
ts_rank (tsv_search_tokens, to_tsquery($1)) AS relevance
FROM documents
) d
WHERE relevance > POWER(10,-10)
GROUP BY 1
),
latest_versions AS (
SELECT DISTINCT ON (module_path) module_path, version, commit_time
FROM versions
ORDER BY module_path, major DESC, minor DESC, patch DESC, prerelease DESC
)
SELECT
p.path AS package_path,
v.version,
p.module_path,
p.name,
p.synopsis,
p.license_types,
p.license_paths,
v.commit_time,
COALESCE(i.num_imported_by, 0) AS num_imported_by,
d.relevance * log(exp(1) + COALESCE(i.num_imported_by, 0)) AS rank,
COUNT(*) OVER() AS total
FROM latest_versions v
INNER JOIN vw_licensed_packages p
ON p.module_path = v.module_path
AND p.version=v.version
INNER JOIN docs d
ON d.package_path = p.path
LEFT JOIN imported_by i
ON i.to_path = p.path
ORDER BY rank DESC
LIMIT $2 OFFSET $3;`
rows, err := db.QueryContext(ctx, query, strings.Join(terms, " | "), limit, offset)
if err != nil {
return nil, fmt.Errorf("db.QueryContext(ctx, %s, %q, %d, %d): %v", query, terms, limit, offset, err)