Mirror of https://github.com/golang/pkgsite.git
internal/postgres: change search tokens
Change the way that the Postgres ts_vector (list of search tokens) is computed:

- Use the path_tokens text search configuration when creating the tsvector
  for the path.
- Construct sections B and C of the search document from the synopsis and
  parts of the README. Parts of this processing are:
  - Extract only the text of a markdown README, to remove images and other
    extraneous information.
  - Add alternatives to certain words in the synopsis and README. For
    example, add "postgresql" whenever we see "postgres".
- Modify the ts_rank call in the code to use a B weight of 1.
- Change the call to the database search function so that it invokes the
  function that has a B weight of 1.

These changes will require re-computing the search_documents.tsv_search_tokens
column. That should be done after these changes are deployed.

Change-Id: Ib81601326f11efd81c8bc733694a000eccecf12b
Reviewed-on: https://team-review.git.corp.google.com/c/golang/discovery/+/705958
Reviewed-by: Julie Qiu <julieqiu@google.com>
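To see the shape of the result, here is a minimal, self-contained sketch of the new weighting scheme. It is illustrative only: the docs table, its columns, and the helper function are invented for this example, and the 'path_tokens' text search configuration plus a unique index on path are assumed to come from migrations not shown in this diff.

package sketch

import (
	"context"
	"database/sql"
)

// upsertSketch mirrors the shape of the new upsertSearchStatement: four
// sections concatenated into one tsvector, weighted A (import-path tokens),
// B (synopsis), C (first README sentence), and D (rest of the README).
const upsertSketch = `
INSERT INTO docs (path, tokens)
VALUES (
    $1,
    SETWEIGHT(TO_TSVECTOR('path_tokens', $2), 'A') ||
    SETWEIGHT(TO_TSVECTOR($3), 'B') ||
    SETWEIGHT(TO_TSVECTOR($4), 'C') ||
    SETWEIGHT(TO_TSVECTOR($5), 'D'))
ON CONFLICT (path) DO UPDATE SET tokens = excluded.tokens;`

// rankSketch shows the new ts_rank weight array. Its entries are ordered
// {D, C, B, A}; they match the Postgres defaults {0.1, 0.2, 0.4, 1.0}
// except for B, which this commit raises to 1.0.
const rankSketch = `
SELECT path, ts_rank('{0.1, 0.2, 1.0, 1.0}', tokens, websearch_to_tsquery($1)) AS score
FROM docs
ORDER BY score DESC
LIMIT 1;`

// upsertThenRank writes one weighted document and returns the best match
// for query q, exercising both statements against a live database.
func upsertThenRank(ctx context.Context, db *sql.DB, path, pathTokens, b, c, d, q string) (best string, err error) {
	if _, err := db.ExecContext(ctx, upsertSketch, path, pathTokens, b, c, d); err != nil {
		return "", err
	}
	var score float64
	err = db.QueryRowContext(ctx, rankSketch, q).Scan(&best, &score)
	return best, err
}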
Parent: 86348f6125
Commit: ba9b5823bf
@@ -75,13 +75,8 @@ func (db *DB) InsertModule(ctx context.Context, m *internal.Module) (err error)
 		return err
 	}
 
-	// Insert the module's non-internal packages into search_documents.
-	for _, pkg := range m.Packages {
-		if err := db.UpsertSearchDocument(ctx, pkg.Path); err != nil {
-			return err
-		}
-	}
-	return nil
+	// Insert the module's packages into search_documents.
+	return db.UpsertSearchDocuments(ctx, m)
 }
 
 // saveModule inserts a Module into the database along with its packages,
@@ -140,7 +135,7 @@ func (db *DB) saveModule(ctx context.Context, m *internal.Module) error {
 			m.Version,
 			m.CommitTime,
 			m.ReadmeFilePath,
-			makeValidUnicode([]byte(m.ReadmeContents)),
+			makeValidUnicode(m.ReadmeContents),
 			version.ForSorting(m.Version),
 			m.VersionType,
 			m.SeriesPath(),
@@ -158,7 +153,7 @@ func (db *DB) saveModule(ctx context.Context, m *internal.Module) error {
 			return fmt.Errorf("marshalling %+v: %v", l.Coverage, err)
 		}
 		licenseValues = append(licenseValues, m.ModulePath, m.Version,
-			l.FilePath, makeValidUnicode(l.Contents), pq.Array(l.Types), covJSON)
+			l.FilePath, makeValidUnicode(string(l.Contents)), pq.Array(l.Types), covJSON)
 	}
 	if len(licenseValues) > 0 {
 		licenseCols := []string{
@@ -366,11 +361,11 @@ func (db *DB) DeleteModule(ctx context.Context, ddb *database.DB, modulePath, ve
 	return err
 }
 
-// makeValidUnicode removes null runes from license contents, because pq doesn't like them.
-// Also, replace non-unicode characters with the Unicode replacement character, which is
-// the behavior of for ... range on strings.
-func makeValidUnicode(bs []byte) string {
-	s := string(bs)
+// makeValidUnicode removes null runes from a string that will be saved in a
+// column of type TEXT, because pq doesn't like them. It also replaces non-unicode
+// characters with the Unicode replacement character, which is the behavior of
+// for ... range on strings.
+func makeValidUnicode(s string) string {
 	var b strings.Builder
 	for _, r := range s {
 		if r != 0 {
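The diff shows only the top of the rewritten function. A complete body consistent with the new doc comment would look like this (a sketch, not the verbatim remainder of the file):

func makeValidUnicode(s string) string {
	var b strings.Builder
	for _, r := range s {
		if r != 0 { // drop null runes, which pq rejects
			// Ranging over a string yields utf8.RuneError (the Unicode
			// replacement character) for invalid bytes, and WriteRune
			// writes it out, so non-UTF-8 input is repaired here.
			b.WriteRune(r)
		}
	}
	return b.String()
}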
@@ -323,7 +323,7 @@ func TestMakeValidUnicode(t *testing.T) {
 		if (err == nil) != okRaw {
 			t.Errorf("%s, raw: got %v, want error: %t", filename, err, okRaw)
 		}
-		if err := insert(makeValidUnicode(data)); err != nil {
+		if err := insert(makeValidUnicode(string(data))); err != nil {
 			t.Errorf("%s, after making valid: %v", filename, err)
 		}
 	}
@@ -9,7 +9,6 @@ import (
 	"database/sql"
 	"fmt"
 	"math"
-	"sort"
 	"strings"
 	"time"
 
@@ -140,8 +139,11 @@ const (
 // dramatic: being 2x as popular only has an additive effect.
 // - A penalty factor for non-redistributable modules, since a lot of
 //   details cannot be displayed.
+// The first argument to ts_rank is an array of weights for the four tsvector sections,
+// in the order D, C, B, A.
+// The weights below match the defaults except for B.
 var scoreExpr = fmt.Sprintf(`
-	ts_rank(tsv_search_tokens, websearch_to_tsquery($1)) *
+	ts_rank('{0.1, 0.2, 1.0, 1.0}', tsv_search_tokens, websearch_to_tsquery($1)) *
 	ln(exp(1)+imported_by_count) *
 	CASE WHEN redistributable THEN 1 ELSE %f END *
 	CASE WHEN COALESCE(has_go_mod, true) THEN 1 ELSE %f END
@@ -392,7 +394,7 @@ func (db *DB) popularSearch(ctx context.Context, searchQuery string, limit, offs
 			commit_time,
 			imported_by_count,
 			score
-		FROM popular_search_go_mod($1, $2, $3, $4, $5)`
+		FROM popular_search($1, $2, $3, $4, $5)`
 	var results []*internal.SearchResult
 	collect := func(rows *sql.Rows) error {
 		var r internal.SearchResult
@@ -496,13 +498,10 @@ var upsertSearchStatement = fmt.Sprintf(`
 		m.commit_time,
 		m.has_go_mod,
 		(
-			SETWEIGHT(TO_TSVECTOR($2), 'A') ||
-			-- Try to limit to the maximum length of a tsvector.
-			-- This is just a guess, since the max length is in bytes and there
-			-- doesn't seem to be a way to determine the number of bytes in a tsvector.
-			-- Since the max is 1048575, make sure part is half that size.
-			SETWEIGHT(TO_TSVECTOR(left(p.synopsis, 1048575/2)), 'B') ||
-			SETWEIGHT(TO_TSVECTOR(left(m.readme_contents, 1048575/2)), 'C')
+			SETWEIGHT(TO_TSVECTOR('path_tokens', $2), 'A') ||
+			SETWEIGHT(TO_TSVECTOR($3), 'B') ||
+			SETWEIGHT(TO_TSVECTOR($4), 'C') ||
+			SETWEIGHT(TO_TSVECTOR($5), 'D')
 		),
 		hll_hash(p.path) & (%[1]d - 1),
 		hll_zeros(hll_hash(p.path))
@@ -544,52 +543,86 @@ var upsertSearchStatement = fmt.Sprintf(`
 		END)
 ;`, hllRegisterCount)
 
+// UpsertSearchDocuments adds search information for mod to the search_documents table.
+func (db *DB) UpsertSearchDocuments(ctx context.Context, mod *internal.Module) (err error) {
+	defer derrors.Wrap(&err, "UpsertSearchDocuments(ctx, %q)", mod.ModulePath)
+
+	for _, pkg := range mod.Packages {
+		if isInternalPackage(pkg.Path) {
+			continue
+		}
+		err := db.UpsertSearchDocument(ctx, upsertSearchDocumentArgs{
+			PackagePath:    pkg.Path,
+			ModulePath:     mod.ModulePath,
+			Synopsis:       pkg.Synopsis,
+			ReadmeFilePath: mod.ReadmeFilePath,
+			ReadmeContents: mod.ReadmeContents,
+		})
+		if err != nil {
+			return err
+		}
+	}
+	return nil
+}
+
+type upsertSearchDocumentArgs struct {
+	PackagePath    string
+	ModulePath     string
+	Synopsis       string
+	ReadmeFilePath string
+	ReadmeContents string
+}
+
 // UpsertSearchDocument inserts a row for each package in the module, if that
 // package is the latest version and is not internal.
 //
 // The given module should have already been validated via a call to
 // validateModule.
-func (db *DB) UpsertSearchDocument(ctx context.Context, path string) (err error) {
-	defer derrors.Wrap(&err, "UpsertSearchDocument(ctx, %q)", path)
+func (db *DB) UpsertSearchDocument(ctx context.Context, args upsertSearchDocumentArgs) (err error) {
+	defer derrors.Wrap(&err, "UpsertSearchDocument(ctx, %q, %q)", args.PackagePath, args.ModulePath)
 
-	if isInternalPackage(path) {
-		return nil
+	// Only summarize the README if the package and module have the same path.
+	if args.PackagePath != args.ModulePath {
+		args.ReadmeFilePath = ""
+		args.ReadmeContents = ""
 	}
-	pathTokens := strings.Join(GeneratePathTokens(path), " ")
-	_, err = db.db.Exec(ctx, upsertSearchStatement, path, pathTokens)
+	pathTokens := strings.Join(GeneratePathTokens(args.PackagePath), " ")
+	sectionB, sectionC, sectionD := SearchDocumentSections(args.Synopsis, args.ReadmeFilePath, args.ReadmeContents)
+	_, err = db.db.Exec(ctx, upsertSearchStatement, args.PackagePath, pathTokens, sectionB, sectionC, sectionD)
 	return err
 }
 
 // GetPackagesForSearchDocumentUpsert fetches all paths from packages that do
 // not exist in search_documents.
-func (db *DB) GetPackagesForSearchDocumentUpsert(ctx context.Context, limit int) (paths []string, err error) {
+func (db *DB) GetPackagesForSearchDocumentUpsert(ctx context.Context, limit int) (argsList []upsertSearchDocumentArgs, err error) {
 	defer derrors.Add(&err, "GetPackagesForSearchDocumentUpsert(ctx, %d)", limit)
 
 	query := `
-		SELECT DISTINCT(path)
+		SELECT DISTINCT ON (p.path) p.path, m.module_path, p.synopsis, m.readme_file_path, m.readme_contents
 		FROM packages p
+		INNER JOIN modules m
+		USING (module_path, version)
 		LEFT JOIN search_documents sd
 		ON p.path = sd.package_path
 		WHERE sd.package_path IS NULL
 		LIMIT $1`
 
 	collect := func(rows *sql.Rows) error {
-		var path string
-		if err := rows.Scan(&path); err != nil {
+		var a upsertSearchDocumentArgs
+		if err := rows.Scan(&a.PackagePath, &a.ModulePath, &a.Synopsis, &a.ReadmeFilePath, &a.ReadmeContents); err != nil {
 			return err
 		}
 		// Filter out packages in internal directories, since
 		// they are skipped when upserting search_documents.
-		if !isInternalPackage(path) {
-			paths = append(paths, path)
+		if !isInternalPackage(a.PackagePath) {
+			argsList = append(argsList, a)
 		}
 		return nil
 	}
 	if err := db.db.RunQuery(ctx, query, collect, limit); err != nil {
 		return nil, err
 	}
-	sort.Strings(paths)
-	return paths, nil
+	return argsList, nil
 }
 
 // UpdateSearchDocumentsImportedByCount updates imported_by_count and
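One subtlety in the new query: DISTINCT ON (p.path) returns a single row per package path, and with no ORDER BY Postgres chooses arbitrarily among that path's versions. A standalone illustration of how an ORDER BY would pin the choice (hypothetical, not from this commit; note that ordering by m.version is textual ordering, not semver ordering):

// distinctOnSketch pins the row DISTINCT ON picks by giving an ORDER BY
// whose leading expressions match the DISTINCT ON expressions.
const distinctOnSketch = `
SELECT DISTINCT ON (p.path) p.path, m.readme_contents
FROM packages p
INNER JOIN modules m USING (module_path, version)
ORDER BY p.path, m.version DESC;`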
@@ -369,6 +369,12 @@ func TestInsertSearchDocumentAndSearch(t *testing.T) {
 		}
 	)
 
+	const (
+		packageScore  = 0.6079270839691162
+		goAndCDKScore = 0.999817967414856
+		cloudScore    = 0.8654518127441406
+	)
+
 	for _, tc := range []struct {
 		name     string
 		packages map[string]*internal.Package
@@ -384,8 +390,8 @@ func TestInsertSearchDocumentAndSearch(t *testing.T) {
 				modKube:  pkgKube,
 			},
 			want: []*internal.SearchResult{
-				goCdkResult(0.2431708425283432, 2),
-				kubeResult(0.2431708425283432, 2),
+				goCdkResult(packageScore, 2),
+				kubeResult(packageScore, 2),
 			},
 		},
 		{
@@ -398,7 +404,7 @@ func TestInsertSearchDocumentAndSearch(t *testing.T) {
 				modGoCDK: pkgGoCDK,
 			},
 			want: []*internal.SearchResult{
-				goCdkResult(0.2431708425283432, 2),
+				goCdkResult(packageScore, 2),
 			},
 		},
 		{
@@ -411,7 +417,7 @@ func TestInsertSearchDocumentAndSearch(t *testing.T) {
 				modKube: pkgKube,
 			},
 			want: []*internal.SearchResult{
-				kubeResult(0.2431708425283432, 2),
+				kubeResult(packageScore, 2),
 			},
 		},
 		{
@@ -422,7 +428,7 @@ func TestInsertSearchDocumentAndSearch(t *testing.T) {
 				modKube: pkgKube,
 			},
 			want: []*internal.SearchResult{
-				goCdkResult(0.733867883682251, 1),
+				goCdkResult(goAndCDKScore, 1),
 			},
 		},
 		{
@@ -432,7 +438,7 @@ func TestInsertSearchDocumentAndSearch(t *testing.T) {
 				modGoCDK: pkgGoCDK,
 			},
 			want: []*internal.SearchResult{
-				goCdkResult(0.7109370231628418, 1),
+				goCdkResult(cloudScore, 1),
 			},
 		},
 	} {
@@ -820,28 +826,42 @@ func TestGetPackagesForSearchDocumentUpsert(t *testing.T) {
 	}
 	// pkgPaths should be "A", since pkg "A" exists in packages but not
 	// search_documents.
-	pkgPaths, err := testDB.GetPackagesForSearchDocumentUpsert(ctx, 10)
+	got, err := testDB.GetPackagesForSearchDocumentUpsert(ctx, 10)
 	if err != nil {
 		t.Fatal(err)
 	}
-	want := []string{"A", "A/notinternal"}
-	if diff := cmp.Diff(want, pkgPaths); diff != "" {
+	sort.Slice(got, func(i, j int) bool { return got[i].PackagePath < got[j].PackagePath })
+	want := []upsertSearchDocumentArgs{
+		{
+			PackagePath:    "A",
+			ModulePath:     moduleA.ModulePath,
+			ReadmeFilePath: "README.md",
+			ReadmeContents: "readme",
+		},
+		{
+			PackagePath:    "A/notinternal",
+			ModulePath:     moduleA.ModulePath,
+			ReadmeFilePath: "README.md",
+			ReadmeContents: "readme",
+		},
+	}
+	if diff := cmp.Diff(want, got); diff != "" {
 		t.Fatalf("testDB.GetPackagesForSearchDocumentUpsert mismatch(-want +got):\n%s", diff)
 	}
 
-	for _, path := range want {
-		if err := testDB.UpsertSearchDocument(ctx, path); err != nil {
+	for _, args := range got {
+		if err := testDB.UpsertSearchDocument(ctx, args); err != nil {
 			t.Fatal(err)
 		}
 	}
 
 	// pkgPaths should be an empty slice, since pkg "A" and "A/notinternal"
 	// were just inserted into search_documents.
-	pkgPaths, err = testDB.GetPackagesForSearchDocumentUpsert(ctx, 10)
+	got, err = testDB.GetPackagesForSearchDocumentUpsert(ctx, 10)
 	if err != nil {
 		t.Fatal(err)
 	}
-	if len(pkgPaths) != 0 {
-		t.Fatalf("expected testDB.GetPackagesForSearchDocumentUpsert to return an empty slice; got %v", pkgPaths)
+	if len(got) != 0 {
+		t.Fatalf("expected testDB.GetPackagesForSearchDocumentUpsert to return an empty slice; got %v", got)
 	}
 }
@@ -0,0 +1,213 @@
+// Copyright 2020 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package postgres
+
+import (
+	"path/filepath"
+	"strings"
+	"unicode"
+
+	"github.com/russross/blackfriday/v2"
+)
+
+const (
+	maxSectionWords   = 50
+	maxReadmeFraction = 0.5
+)
+
+// SearchDocumentSections computes the B and C sections of a Postgres search
+// document from a package synopsis and a README.
+// By "B section" and "C section" we mean the portion of the tsvector with weight
+// "B" and "C", respectively.
+//
+// The B section consists of the synopsis.
+// The C section consists of the first sentence of the README.
+// The D section consists of the remainder of the README.
+// All sections are split into words and processed for replacements.
+// Each section is limited to maxSectionWords words, and in addition the
+// D section is limited to an initial fraction of the README, determined
+// by maxReadmeFraction.
+func SearchDocumentSections(synopsis, readmeFilename, readme string) (b, c, d string) {
+	return searchDocumentSections(synopsis, readmeFilename, readme, maxSectionWords, maxReadmeFraction)
+}
+
+func searchDocumentSections(synopsis, readmeFilename, readme string, maxSecWords int, maxReadmeFrac float64) (b, c, d string) {
+	var readmeFirst, readmeRest string
+	if isMarkdown(readmeFilename) {
+		readme = processMarkdown(readme)
+	}
+	if i := sentenceEndIndex(readme); i > 0 {
+		readmeFirst, readmeRest = readme[:i+1], readme[i+1:]
+	} else {
+		readmeRest = readme
+	}
+	sw := processWords(synopsis)
+	rwf := processWords(readmeFirst)
+	rwr := processWords(readmeRest)
+
+	sectionB, _ := split(sw, maxSecWords)
+	sectionC, rwfd := split(rwf, maxSecWords)
+	// section D is the part of the readme that is not in sectionC.
+	rwd := append(rwfd, rwr...)
+	// Keep maxSecWords of section D, but not more than maxReadmeFrac.
+	f := int(maxReadmeFrac * float64(len(rwd)))
+	nkeep := maxSecWords
+	if nkeep > f {
+		nkeep = f
+	}
+	sectionD, _ := split(rwd, nkeep)
+
+	// If there is no synopsis, use first sentence of the README.
+	// But do not promote the rest of the README to section C.
+	if len(sectionB) == 0 {
+		sectionB = sectionC
+		sectionC = nil
+	}
+
+	prep := func(ws []string) string {
+		return makeValidUnicode(strings.Join(ws, " "))
+	}
+
+	return prep(sectionB), prep(sectionC), prep(sectionD)
+}
+
+// split splits a slice of strings into two parts. The first has length <= n,
+// and the second is the rest of the slice. If n is negative, the first part is nil and
+// the second part is the entire slice.
+func split(a []string, n int) ([]string, []string) {
+	if n >= len(a) {
+		return a, nil
+	}
+	return a[:n], a[n:]
+}
+
+// sentenceEndIndex returns the index in s of the end of the first sentence, or
+// -1 if no end can be found. A sentence ends at a '.', '!' or '?' that is
+// followed by a space (or ends the string), and is not preceded by an
+// uppercase letter.
+func sentenceEndIndex(s string) int {
+	var prev1, prev2 rune
+
+	end := func() bool {
+		return !unicode.IsUpper(prev2) && (prev1 == '.' || prev1 == '!' || prev1 == '?')
+	}
+
+	for i, r := range s {
+		if unicode.IsSpace(r) && end() {
+			return i - 1
+		}
+		prev2 = prev1
+		prev1 = r
+	}
+	if end() {
+		return len(s) - 1
+	}
+	return -1
+}
+
+// processWords splits s into words at whitespace, then processes each word.
+func processWords(s string) []string {
+	fields := strings.Fields(strings.ToLower(s))
+	var words []string
+	for _, f := range fields {
+		words = append(words, processWord(f)...)
+	}
+	return words
+}
+
+// summaryReplacements is used to replace words with other words.
+// It is used by processWord, below.
+// Example key-value pairs:
+//   "deleteMe": nil                        // removes "deleteMe"
+//   "rand":     []string{"random"}         // replace "rand" with "random"
+//   "utf-8":    []string{"utf-8", "utf8"}  // add "utf8" whenever "utf-8" is seen
+var summaryReplacements = map[string][]string{
+	"postgres":   []string{"postgres", "postgresql"},
+	"postgresql": []string{"postgres", "postgresql"},
+	"rand":       []string{"random"},
+	"mongo":      []string{"mongo", "mongodb"},
+	"mongodb":    []string{"mongo", "mongodb"},
+	"redis":      []string{"redis", "redisdb"},
+	"redisdb":    []string{"redis", "redisdb"},
+	"logger":     []string{"logger", "log"}, // Postgres stemmer does not handle -er
+	"parser":     []string{"parser", "parse"},
+	"utf-8":      []string{"utf-8", "utf8"},
+}
+
+// processWord performs processing on s, returning zero or more words.
+// Its main purpose is to apply summaryReplacements to replace
+// certain words with synonyms or additional search terms.
+func processWord(s string) []string {
+	s = strings.TrimFunc(s, unicode.IsPunct)
+	if s == "" {
+		return nil
+	}
+	if rs, ok := summaryReplacements[s]; ok {
+		return rs
+	}
+	if !hyphenSplit(s) {
+		return []string{s}
+	}
+	// Apply replacements to parts of hyphenated words.
+	ws := strings.Split(s, "-")
+	if len(ws) == 1 {
+		return ws
+	}
+	result := []string{s} // Include the full hyphenated word.
+	for _, w := range ws {
+		if rs, ok := summaryReplacements[w]; ok {
+			result = append(result, rs...)
+		}
+		// We don't need to include the parts; the Postgres text-search processor will do that.
+	}
+	return result
+}
+
+// hyphenSplit reports whether s should be split on hyphens.
+func hyphenSplit(s string) bool {
+	return !(strings.HasPrefix(s, "http://") || strings.HasPrefix(s, "https://"))
+}
+
+// isMarkdown reports whether filename says that the file contains markdown.
+func isMarkdown(filename string) bool {
+	ext := strings.ToLower(filepath.Ext(filename))
+	// https://tools.ietf.org/html/rfc7763 mentions both extensions.
+	return ext == ".md" || ext == ".markdown"
+}
+
+// processMarkdown returns the text of a markdown document.
+// It omits all formatting and images.
+func processMarkdown(s string) string {
+	parser := blackfriday.New(blackfriday.WithExtensions(blackfriday.CommonExtensions))
+	root := parser.Parse([]byte(s))
+	buf := walkMarkdown(root, nil, 0)
+	return string(buf)
+}
+
+// walkMarkdown traverses a blackfriday parse tree, extracting text.
+func walkMarkdown(n *blackfriday.Node, buf []byte, level int) []byte {
+	if n == nil {
+		return buf
+	}
+	switch n.Type {
+	case blackfriday.Image:
+		// Skip images because they usually are irrelevant to the package
+		// (badges and such).
+		return buf
+	case blackfriday.CodeBlock:
+		// Skip code blocks because they have a wide variety of unrelated symbols.
+		return buf
+	case blackfriday.Paragraph, blackfriday.Heading:
+		if len(buf) > 0 {
+			buf = append(buf, ' ')
+		}
+	default:
+		buf = append(buf, n.Literal...)
+	}
+	for c := n.FirstChild; c != nil; c = c.Next {
+		buf = walkMarkdown(c, buf, level+1)
+	}
+	return buf
+}
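To make the section-D word budget concrete: the "non-markdown" test case in the next file has a 13-word README with no sentence end, so with maxSecWords = 6 and a fraction of 0.5, D keeps min(6, int(0.5*13)) = 6 words. A standalone restatement of that arithmetic, mirroring the nkeep computation above:

// sectionDBudget restates the section-D budget from searchDocumentSections:
// keep at most maxSecWords words, but never more than maxReadmeFrac of the
// README's remaining words.
func sectionDBudget(readmeWords, maxSecWords int, maxReadmeFrac float64) int {
	f := int(maxReadmeFrac * float64(readmeWords))
	if maxSecWords > f {
		return f
	}
	return maxSecWords
}

// sectionDBudget(13, 6, 0.5) == 6, yielding "this readme doesn't have a sentence".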
@@ -0,0 +1,133 @@
+// Copyright 2020 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package postgres
+
+import (
+	"testing"
+
+	"github.com/google/go-cmp/cmp"
+)
+
+func TestSearchDocumentSections(t *testing.T) {
+	for _, test := range []struct {
+		name                string
+		synopsis            string
+		readmeFilename      string
+		readmeContents      string
+		wantB, wantC, wantD string
+	}{
+		{
+			"blackfriday",
+			"This is a synopsis.",
+			"foo.md",
+			`Package blackfriday is a [markdown](http://foo) processor. That _is_ all that it is.`,
+
+			"this is a synopsis",
+			"package blackfriday is a markdown processor",
+			"that is all",
+		},
+		{
+			"non-markdown",
+			"This synopsis is too long so we'll truncate it.",
+			"README",
+			"This README doesn't have a sentence end so the whole thing is D",
+
+			"this synopsis is too long so",
+			"",
+			"this readme doesn't have a sentence",
+		},
+		{
+			"viper",
+			"",
+			"README.md",
+			`
+![viper logo](https://cloud.githubusercontent.com/assets/173412/10886745/998df88a-8151-11e5-9448-4736db51020d.png)
+
+Go configuration with fangs!
+
+[![Actions](https://github.com/spf13/viper/workflows/CI/badge.svg)](https://github.com/spf13/viper)
+[![Join the chat at https://gitter.im/spf13/viper](https://badges.gitter.im/Join%20Chat.svg)](https://gitter.im/spf13/viper?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge)
+[![GoDoc](https://godoc.org/github.com/spf13/viper?status.svg)](https://godoc.org/github.com/spf13/viper)
+
+Many Go projects are built using Viper including:`,
+
+			"go configuration with fangs", // first sentence of README promoted
+			"",
+			"many go projects are",
+		},
+	} {
+		gotB, gotC, gotD := searchDocumentSections(test.synopsis, test.readmeFilename, test.readmeContents, 6, 0.5)
+		if gotB != test.wantB {
+			t.Errorf("%s, B: got %q, want %q", test.name, gotB, test.wantB)
+		}
+		if gotC != test.wantC {
+			t.Errorf("%s, C: got %q, want %q", test.name, gotC, test.wantC)
+		}
+		if gotD != test.wantD {
+			t.Errorf("%s, D: got %q, want %q", test.name, gotD, test.wantD)
+		}
+	}
+}
+
+func TestProcessWords(t *testing.T) {
+	for _, test := range []struct {
+		in   string
+		want []string
+	}{
+		{"", nil},
+		{"foo", []string{"foo"}},
+		{" foo \t bar\n", []string{"foo", "bar"}},
+		{"http://foo/bar/baz?x=1", []string{"http://foo/bar/baz?x=1"}},
+		{"This, however, shall. not; stand?", []string{"this", "however", "shall", "not", "stand"}},
+		{"a postgres and NATS server over HTTP", []string{
+			"a", "postgres", "postgresql", "and", "nats", "server", "over", "http"}},
+		{"http://a-b-c.com full-text chart-parser", []string{
+			"http://a-b-c.com", "full-text", "chart-parser", "parser", "parse"}},
+	} {
+		got := processWords(test.in)
+		if !cmp.Equal(got, test.want) {
+			t.Errorf("%q:\ngot %#v\nwant %#v", test.in, got, test.want)
+		}
+	}
+}
+
+func TestProcessMarkdown(t *testing.T) {
+	const (
+		in = `
+Blackfriday [![Build Status](https://travis-ci.org/russross/blackfriday.svg?branch=master)](https://travis-ci.org/russross/blackfriday)
+===========
+
+_Blackfriday_ is a [Markdown][1] *processor* implemented in [Go](https://golang.org).
+
+[1]: https://daringfireball.net/projects/markdown/ "Markdown"
+`
+
+		want = `Blackfriday Blackfriday is a Markdown processor implemented in Go.`
+	)
+
+	got := processMarkdown(in)
+	if got != want {
+		t.Errorf("got\n%s\nwant\n%s", got, want)
+	}
+}
+
+func TestSentenceEndIndex(t *testing.T) {
+	for _, test := range []struct {
+		in   string
+		want int
+	}{
+		{"", -1},
+		{"Hello. What's up?", 5},
+		{"unicode π∆!", 13},
+		{"D. C. Fontana?", 13},
+		{"D. c. Fontana?", 4},
+		{"no end", -1},
+	} {
+		got := sentenceEndIndex(test.in)
+		if got != test.want {
+			t.Errorf("%s: got %d, want %d", test.in, got, test.want)
+		}
+	}
+}
@@ -163,16 +163,16 @@ func (s *Server) handlePopulateSearchDocuments(w http.ResponseWriter, r *http.Re
 	limit := parseIntParam(r, "limit", 100)
 	ctx := r.Context()
 	log.Infof(ctx, "Populating search documents for %d packages", limit)
-	pkgPaths, err := s.db.GetPackagesForSearchDocumentUpsert(ctx, limit)
+	sdargs, err := s.db.GetPackagesForSearchDocumentUpsert(ctx, limit)
 	if err != nil {
 		log.Errorf(ctx, "s.db.GetPackagesSearchDocumentUpsert(ctx): %v", err)
 		http.Error(w, http.StatusText(http.StatusInternalServerError), http.StatusInternalServerError)
 		return
 	}
 
-	for _, path := range pkgPaths {
-		if err := s.db.UpsertSearchDocument(ctx, path); err != nil {
-			log.Errorf(ctx, "s.db.UpsertSearchDocument(ctx, %q): %v", path, err)
+	for _, args := range sdargs {
+		if err := s.db.UpsertSearchDocument(ctx, args); err != nil {
+			log.Errorf(ctx, "s.db.UpsertSearchDocument(ctx, %v): %v", args, err)
 			http.Error(w, http.StatusText(http.StatusInternalServerError), http.StatusInternalServerError)
 			return
 		}
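As an operational footnote, this handler backfills packages that have no search_documents row yet, in batches of `limit`. A hypothetical way to drive it (the host and route path are assumptions; the handler's registration is outside this diff):

package sketch

import "net/http"

// populateOnce triggers one batch of up to 500 search-document upserts.
// The base URL and route here are illustrative, not taken from the commit.
func populateOnce(client *http.Client, base string) error {
	resp, err := client.Get(base + "/populate-search-documents?limit=500")
	if err != nil {
		return err
	}
	return resp.Body.Close()
}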