internal/postgres: change symbolsearch to index underscores

The symbol search insert and update queries are changed to index
underscores as slashes. For example, "A_B" is indexed as "A/B".

Underscores are treated as "blanks" by the postgres parser. However, we
want "A_B" to be ranked lower than just "A" in a search for "A", not
equally.

For golang/go#44142

Change-Id: I4399c046adae33f69cb806012a68127ab21cb937
Reviewed-on: https://go-review.googlesource.com/c/pkgsite/+/332413
Trust: Julie Qiu <julie@golang.org>
Run-TryBot: Julie Qiu <julie@golang.org>
TryBot-Result: kokoro <noreply+kokoro@google.com>
Reviewed-by: Jonathan Amsterdam <jba@google.com>
This commit is contained in:
Julie Qiu 2021-07-02 11:30:29 -04:00
Родитель f35d711d9d
Коммит ebfc9de34b
1 изменённых файлов: 27 добавлений и 6 удалений

Просмотреть файл

@ -41,14 +41,16 @@ func upsertSymbolSearchDocuments(ctx context.Context, tx *database.DB,
s.id,
u.id,` +
// Index <package>.<identifier> (i.e. "sql.DB.Begin")
`SETWEIGHT( TO_TSVECTOR('simple', concat(s.name, ' ', concat(u.name, '.', s.name))), 'A') ||` +
symbolSetWeight("concat(s.name, ' ', concat(u.name, '.', s.name))", "A") + " || " +
// Index <identifier>, including the parent name (i.e. DB.Begin).
`SETWEIGHT( TO_TSVECTOR('simple', s.name), 'A') ||` +
symbolSetWeight("s.name", "A") + " || " +
// Index <identifier> without parent name (i.e. "Begin").
//
// This is weighted less, so that if other symbols are just named
// "Begin" they will rank higher in a search for "Begin".
`SETWEIGHT( TO_TSVECTOR('simple', split_part(s.name, '.', 2)), 'B') AS tokens` +
symbolSetWeight("split_part(s.name, '.', 2)", "C") +
// TODO(https://golang.org/issue/44142): allow searching for "A_B" when
// querying for either "A" or "B", but at a lower rank.
`
FROM symbol_names s
INNER JOIN package_symbols ps ON s.id = ps.symbol_name_id
@ -107,8 +109,8 @@ func (db *DB) symbolSearch(ctx context.Context, q string, limit, offset, maxResu
INNER JOIN documentation_symbols ds ON ds.documentation_id = d.id
INNER JOIN package_symbols ps ON ps.id = ds.package_symbol_id
WHERE
ssd.tsv_symbol_tokens @@ to_tsquery('simple', $1)
ORDER BY
ssd.tsv_symbol_tokens @@ `+symbolToTSQuery+
`ORDER BY
symbol_name,
CASE WHEN goos = 'all' THEN 0
WHEN goos = 'linux' THEN 1
@ -166,8 +168,27 @@ func (db *DB) symbolSearch(ctx context.Context, q string, limit, offset, maxResu
}
}
// symbolTextSearchConfiguration is the search configuration that is used for
// indexing and searching for symbols.
const symbolTextSearchConfiguration = "simple"
// processSymbol converts a symbol with underscores to slashes (for example,
// "A_B" -> "A/B"). This is because the postgres parser treats underscores as
// slashes, but we want a search for "A" to rank "A_B" lower than just "A". We
// also want to be able to search specificially for "A_B".
func processSymbol(s string) string {
return fmt.Sprintf("replace(%s, '_', '/')", s)
}
var symbolToTSQuery = fmt.Sprintf("to_tsquery('%s', %s)", symbolTextSearchConfiguration, processSymbol("$1"))
func symbolSetWeight(s, w string) string {
return fmt.Sprintf("SETWEIGHT(TO_TSVECTOR('%s', %s), '%s')",
symbolTextSearchConfiguration, processSymbol(s), w)
}
var symbolScoreExpr = fmt.Sprintf(`
ts_rank('{0.1, 0.2, 1.0, 1.0}', ssd.tsv_symbol_tokens, to_tsquery('simple', $1)) *
ts_rank('{0.1, 0.2, 1.0, 1.0}', ssd.tsv_symbol_tokens, `+symbolToTSQuery+`) *
ln(exp(1)+imported_by_count) *
CASE WHEN u.redistributable THEN 1 ELSE %f END *
CASE WHEN COALESCE(has_go_mod, true) THEN 1 ELSE %f END