Mirror of https://github.com/golang/pkgsite.git
internal/postgres: change search tokens
Change the way that the Postgres ts_vector (list of search tokens) is computed:

- Use the path_tokens text search configuration when creating the tsvector
  for the path.
- Construct sections B and C of the search document from the synopsis and
  parts of the README. Parts of this processing are:
  - Extract only the text of a markdown README, to remove images and other
    extraneous information.
  - Add alternatives to certain words in the synopsis and README. For
    example, add "postgresql" whenever we see "postgres".
- Modify the ts_rank call in the code to use a B weight of 1.
- Change the call to the database search function so that it invokes the
  function that has a B weight of 1.

These changes will require re-computing the search_documents.tsv_search_tokens
column. That should be done after these changes are deployed.

Change-Id: Ib81601326f11efd81c8bc733694a000eccecf12b
Reviewed-on: https://team-review.git.corp.google.com/c/golang/discovery/+/705958
Reviewed-by: Julie Qiu <julieqiu@google.com>
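To see the shape of the result, here is a minimal, self-contained sketch of the new weighting scheme. It is illustrative only: the docs table, its columns, and the helper function are invented for this example, and the 'path_tokens' text search configuration plus a unique index on path are assumed to come from migrations not shown in this diff.

package sketch

import (
	"context"
	"database/sql"
)

// upsertSketch mirrors the shape of the new upsertSearchStatement: four
// sections concatenated into one tsvector, weighted A (import-path tokens),
// B (synopsis), C (first README sentence), and D (rest of the README).
const upsertSketch = `
INSERT INTO docs (path, tokens)
VALUES (
    $1,
    SETWEIGHT(TO_TSVECTOR('path_tokens', $2), 'A') ||
    SETWEIGHT(TO_TSVECTOR($3), 'B') ||
    SETWEIGHT(TO_TSVECTOR($4), 'C') ||
    SETWEIGHT(TO_TSVECTOR($5), 'D'))
ON CONFLICT (path) DO UPDATE SET tokens = excluded.tokens;`

// rankSketch shows the new ts_rank weight array. Its entries are ordered
// {D, C, B, A}; they match the Postgres defaults {0.1, 0.2, 0.4, 1.0}
// except for B, which this commit raises to 1.0.
const rankSketch = `
SELECT path, ts_rank('{0.1, 0.2, 1.0, 1.0}', tokens, websearch_to_tsquery($1)) AS score
FROM docs
ORDER BY score DESC
LIMIT 1;`

// upsertThenRank writes one weighted document and returns the best match
// for query q, exercising both statements against a live database.
func upsertThenRank(ctx context.Context, db *sql.DB, path, pathTokens, b, c, d, q string) (best string, err error) {
	if _, err := db.ExecContext(ctx, upsertSketch, path, pathTokens, b, c, d); err != nil {
		return "", err
	}
	var score float64
	err = db.QueryRowContext(ctx, rankSketch, q).Scan(&best, &score)
	return best, err
}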
Parent: 86348f6125
Commit: ba9b5823bf
@@ -75,13 +75,8 @@ func (db *DB) InsertModule(ctx context.Context, m *internal.Module) (err error)
 		return err
 	}
 
-	// Insert the module's non-internal packages into search_documents.
-	for _, pkg := range m.Packages {
-		if err := db.UpsertSearchDocument(ctx, pkg.Path); err != nil {
-			return err
-		}
-	}
-	return nil
+	// Insert the module's packages into search_documents.
+	return db.UpsertSearchDocuments(ctx, m)
 }
 
 // saveModule inserts a Module into the database along with its packages,
@@ -140,7 +135,7 @@ func (db *DB) saveModule(ctx context.Context, m *internal.Module) error {
 			m.Version,
 			m.CommitTime,
 			m.ReadmeFilePath,
-			makeValidUnicode([]byte(m.ReadmeContents)),
+			makeValidUnicode(m.ReadmeContents),
 			version.ForSorting(m.Version),
 			m.VersionType,
 			m.SeriesPath(),
@@ -158,7 +153,7 @@ func (db *DB) saveModule(ctx context.Context, m *internal.Module) error {
 			return fmt.Errorf("marshalling %+v: %v", l.Coverage, err)
 		}
 		licenseValues = append(licenseValues, m.ModulePath, m.Version,
-			l.FilePath, makeValidUnicode(l.Contents), pq.Array(l.Types), covJSON)
+			l.FilePath, makeValidUnicode(string(l.Contents)), pq.Array(l.Types), covJSON)
 	}
 	if len(licenseValues) > 0 {
 		licenseCols := []string{
@@ -366,11 +361,11 @@ func (db *DB) DeleteModule(ctx context.Context, ddb *database.DB, modulePath, ve
 	return err
 }
 
-// makeValidUnicode removes null runes from license contents, because pq doesn't like them.
-// Also, replace non-unicode characters with the Unicode replacement character, which is
-// the behavior of for ... range on strings.
-func makeValidUnicode(bs []byte) string {
-	s := string(bs)
+// makeValidUnicode removes null runes from a string that will be saved in a
+// column of type TEXT, because pq doesn't like them. It also replaces non-unicode
+// characters with the Unicode replacement character, which is the behavior of
+// for ... range on strings.
+func makeValidUnicode(s string) string {
 	var b strings.Builder
 	for _, r := range s {
 		if r != 0 {
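The diff shows only the top of the rewritten function. A complete body consistent with the new doc comment would look like this (a sketch, not the verbatim remainder of the file):

func makeValidUnicode(s string) string {
	var b strings.Builder
	for _, r := range s {
		if r != 0 { // drop null runes, which pq rejects
			// Ranging over a string yields utf8.RuneError (the Unicode
			// replacement character) for invalid bytes, and WriteRune
			// writes it out, so non-UTF-8 input is repaired here.
			b.WriteRune(r)
		}
	}
	return b.String()
}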
@@ -323,7 +323,7 @@ func TestMakeValidUnicode(t *testing.T) {
 		if (err == nil) != okRaw {
 			t.Errorf("%s, raw: got %v, want error: %t", filename, err, okRaw)
 		}
-		if err := insert(makeValidUnicode(data)); err != nil {
+		if err := insert(makeValidUnicode(string(data))); err != nil {
 			t.Errorf("%s, after making valid: %v", filename, err)
 		}
 	}
@@ -9,7 +9,6 @@ import (
 	"database/sql"
 	"fmt"
 	"math"
-	"sort"
 	"strings"
 	"time"
 
@@ -140,8 +139,11 @@ const (
 // dramatic: being 2x as popular only has an additive effect.
 // - A penalty factor for non-redistributable modules, since a lot of
 //   details cannot be displayed.
+// The first argument to ts_rank is an array of weights for the four tsvector sections,
+// in the order D, C, B, A.
+// The weights below match the defaults except for B.
 var scoreExpr = fmt.Sprintf(`
-	ts_rank(tsv_search_tokens, websearch_to_tsquery($1)) *
+	ts_rank('{0.1, 0.2, 1.0, 1.0}', tsv_search_tokens, websearch_to_tsquery($1)) *
 	ln(exp(1)+imported_by_count) *
 	CASE WHEN redistributable THEN 1 ELSE %f END *
 	CASE WHEN COALESCE(has_go_mod, true) THEN 1 ELSE %f END
@@ -392,7 +394,7 @@ func (db *DB) popularSearch(ctx context.Context, searchQuery string, limit, offs
 			commit_time,
 			imported_by_count,
 			score
-		FROM popular_search_go_mod($1, $2, $3, $4, $5)`
+		FROM popular_search($1, $2, $3, $4, $5)`
 	var results []*internal.SearchResult
 	collect := func(rows *sql.Rows) error {
 		var r internal.SearchResult
@@ -496,13 +498,10 @@ var upsertSearchStatement = fmt.Sprintf(`
 		m.commit_time,
 		m.has_go_mod,
 		(
-			SETWEIGHT(TO_TSVECTOR($2), 'A') ||
-			-- Try to limit to the maximum length of a tsvector.
-			-- This is just a guess, since the max length is in bytes and there
-			-- doesn't seem to be a way to determine the number of bytes in a tsvector.
-			-- Since the max is 1048575, make sure part is half that size.
-			SETWEIGHT(TO_TSVECTOR(left(p.synopsis, 1048575/2)), 'B') ||
-			SETWEIGHT(TO_TSVECTOR(left(m.readme_contents, 1048575/2)), 'C')
+			SETWEIGHT(TO_TSVECTOR('path_tokens', $2), 'A') ||
+			SETWEIGHT(TO_TSVECTOR($3), 'B') ||
+			SETWEIGHT(TO_TSVECTOR($4), 'C') ||
+			SETWEIGHT(TO_TSVECTOR($5), 'D')
 		),
 		hll_hash(p.path) & (%[1]d - 1),
 		hll_zeros(hll_hash(p.path))
@@ -544,52 +543,86 @@ var upsertSearchStatement = fmt.Sprintf(`
 		END)
 ;`, hllRegisterCount)
 
+// UpsertSearchDocuments adds search information for mod to the search_documents table.
+func (db *DB) UpsertSearchDocuments(ctx context.Context, mod *internal.Module) (err error) {
+	defer derrors.Wrap(&err, "UpsertSearchDocuments(ctx, %q)", mod.ModulePath)
+
+	for _, pkg := range mod.Packages {
+		if isInternalPackage(pkg.Path) {
+			continue
+		}
+		err := db.UpsertSearchDocument(ctx, upsertSearchDocumentArgs{
+			PackagePath:    pkg.Path,
+			ModulePath:     mod.ModulePath,
+			Synopsis:       pkg.Synopsis,
+			ReadmeFilePath: mod.ReadmeFilePath,
+			ReadmeContents: mod.ReadmeContents,
+		})
+		if err != nil {
+			return err
+		}
+	}
+	return nil
+}
+
+type upsertSearchDocumentArgs struct {
+	PackagePath    string
+	ModulePath     string
+	Synopsis       string
+	ReadmeFilePath string
+	ReadmeContents string
+}
+
 // UpsertSearchDocument inserts a row for each package in the module, if that
 // package is the latest version and is not internal.
 //
 // The given module should have already been validated via a call to
 // validateModule.
-func (db *DB) UpsertSearchDocument(ctx context.Context, path string) (err error) {
-	defer derrors.Wrap(&err, "UpsertSearchDocument(ctx, %q)", path)
+func (db *DB) UpsertSearchDocument(ctx context.Context, args upsertSearchDocumentArgs) (err error) {
+	defer derrors.Wrap(&err, "UpsertSearchDocument(ctx, %q, %q)", args.PackagePath, args.ModulePath)
 
-	if isInternalPackage(path) {
-		return nil
+	// Only summarize the README if the package and module have the same path.
+	if args.PackagePath != args.ModulePath {
+		args.ReadmeFilePath = ""
+		args.ReadmeContents = ""
 	}
-	pathTokens := strings.Join(GeneratePathTokens(path), " ")
-	_, err = db.db.Exec(ctx, upsertSearchStatement, path, pathTokens)
+	pathTokens := strings.Join(GeneratePathTokens(args.PackagePath), " ")
+	sectionB, sectionC, sectionD := SearchDocumentSections(args.Synopsis, args.ReadmeFilePath, args.ReadmeContents)
+	_, err = db.db.Exec(ctx, upsertSearchStatement, args.PackagePath, pathTokens, sectionB, sectionC, sectionD)
 	return err
 }
 
 // GetPackagesForSearchDocumentUpsert fetches all paths from packages that do
 // not exist in search_documents.
-func (db *DB) GetPackagesForSearchDocumentUpsert(ctx context.Context, limit int) (paths []string, err error) {
+func (db *DB) GetPackagesForSearchDocumentUpsert(ctx context.Context, limit int) (argsList []upsertSearchDocumentArgs, err error) {
 	defer derrors.Add(&err, "GetPackagesForSearchDocumentUpsert(ctx, %d)", limit)
 
 	query := `
-		SELECT DISTINCT(path)
+		SELECT DISTINCT ON (p.path) p.path, m.module_path, p.synopsis, m.readme_file_path, m.readme_contents
 		FROM packages p
+		INNER JOIN modules m
+		USING (module_path, version)
 		LEFT JOIN search_documents sd
 		ON p.path = sd.package_path
 		WHERE sd.package_path IS NULL
 		LIMIT $1`
 
 	collect := func(rows *sql.Rows) error {
-		var path string
-		if err := rows.Scan(&path); err != nil {
+		var a upsertSearchDocumentArgs
+		if err := rows.Scan(&a.PackagePath, &a.ModulePath, &a.Synopsis, &a.ReadmeFilePath, &a.ReadmeContents); err != nil {
 			return err
 		}
 		// Filter out packages in internal directories, since
 		// they are skipped when upserting search_documents.
-		if !isInternalPackage(path) {
-			paths = append(paths, path)
+		if !isInternalPackage(a.PackagePath) {
+			argsList = append(argsList, a)
 		}
 		return nil
 	}
 	if err := db.db.RunQuery(ctx, query, collect, limit); err != nil {
 		return nil, err
 	}
-	sort.Strings(paths)
-	return paths, nil
+	return argsList, nil
 }
 
 // UpdateSearchDocumentsImportedByCount updates imported_by_count and
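One subtlety in the new query: DISTINCT ON (p.path) returns a single row per package path, and with no ORDER BY Postgres chooses arbitrarily among that path's versions. A standalone illustration of how an ORDER BY would pin the choice (hypothetical, not from this commit; note that ordering by m.version is textual ordering, not semver ordering):

// distinctOnSketch pins the row DISTINCT ON picks by giving an ORDER BY
// whose leading expressions match the DISTINCT ON expressions.
const distinctOnSketch = `
SELECT DISTINCT ON (p.path) p.path, m.readme_contents
FROM packages p
INNER JOIN modules m USING (module_path, version)
ORDER BY p.path, m.version DESC;`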
@@ -369,6 +369,12 @@ func TestInsertSearchDocumentAndSearch(t *testing.T) {
 		}
 	)
 
+	const (
+		packageScore  = 0.6079270839691162
+		goAndCDKScore = 0.999817967414856
+		cloudScore    = 0.8654518127441406
+	)
+
 	for _, tc := range []struct {
 		name     string
 		packages map[string]*internal.Package
@@ -384,8 +390,8 @@ func TestInsertSearchDocumentAndSearch(t *testing.T) {
 				modKube:  pkgKube,
 			},
 			want: []*internal.SearchResult{
-				goCdkResult(0.2431708425283432, 2),
-				kubeResult(0.2431708425283432, 2),
+				goCdkResult(packageScore, 2),
+				kubeResult(packageScore, 2),
 			},
 		},
 		{
@@ -398,7 +404,7 @@ func TestInsertSearchDocumentAndSearch(t *testing.T) {
 				modGoCDK: pkgGoCDK,
 			},
 			want: []*internal.SearchResult{
-				goCdkResult(0.2431708425283432, 2),
+				goCdkResult(packageScore, 2),
 			},
 		},
 		{
@@ -411,7 +417,7 @@ func TestInsertSearchDocumentAndSearch(t *testing.T) {
 				modKube: pkgKube,
 			},
 			want: []*internal.SearchResult{
-				kubeResult(0.2431708425283432, 2),
+				kubeResult(packageScore, 2),
 			},
 		},
 		{
@@ -422,7 +428,7 @@ func TestInsertSearchDocumentAndSearch(t *testing.T) {
 				modKube: pkgKube,
 			},
 			want: []*internal.SearchResult{
-				goCdkResult(0.733867883682251, 1),
+				goCdkResult(goAndCDKScore, 1),
 			},
 		},
 		{
@@ -432,7 +438,7 @@ func TestInsertSearchDocumentAndSearch(t *testing.T) {
 				modGoCDK: pkgGoCDK,
 			},
 			want: []*internal.SearchResult{
-				goCdkResult(0.7109370231628418, 1),
+				goCdkResult(cloudScore, 1),
 			},
 		},
 	} {
@@ -820,28 +826,42 @@ func TestGetPackagesForSearchDocumentUpsert(t *testing.T) {
 	}
 	// pkgPaths should be "A", since pkg "A" exists in packages but not
 	// search_documents.
-	pkgPaths, err := testDB.GetPackagesForSearchDocumentUpsert(ctx, 10)
+	got, err := testDB.GetPackagesForSearchDocumentUpsert(ctx, 10)
 	if err != nil {
 		t.Fatal(err)
 	}
-	want := []string{"A", "A/notinternal"}
-	if diff := cmp.Diff(want, pkgPaths); diff != "" {
+	sort.Slice(got, func(i, j int) bool { return got[i].PackagePath < got[j].PackagePath })
+	want := []upsertSearchDocumentArgs{
+		{
+			PackagePath:    "A",
+			ModulePath:     moduleA.ModulePath,
+			ReadmeFilePath: "README.md",
+			ReadmeContents: "readme",
+		},
+		{
+			PackagePath:    "A/notinternal",
+			ModulePath:     moduleA.ModulePath,
+			ReadmeFilePath: "README.md",
+			ReadmeContents: "readme",
+		},
+	}
+	if diff := cmp.Diff(want, got); diff != "" {
 		t.Fatalf("testDB.GetPackagesForSearchDocumentUpsert mismatch(-want +got):\n%s", diff)
 	}
 
-	for _, path := range want {
-		if err := testDB.UpsertSearchDocument(ctx, path); err != nil {
+	for _, args := range got {
+		if err := testDB.UpsertSearchDocument(ctx, args); err != nil {
 			t.Fatal(err)
 		}
 	}
 
 	// pkgPaths should be an empty slice, since pkg "A" and "A/notinternal"
 	// were just inserted into search_documents.
-	pkgPaths, err = testDB.GetPackagesForSearchDocumentUpsert(ctx, 10)
+	got, err = testDB.GetPackagesForSearchDocumentUpsert(ctx, 10)
 	if err != nil {
 		t.Fatal(err)
 	}
-	if len(pkgPaths) != 0 {
-		t.Fatalf("expected testDB.GetPackagesForSearchDocumentUpsert to return an empty slice; got %v", pkgPaths)
+	if len(got) != 0 {
+		t.Fatalf("expected testDB.GetPackagesForSearchDocumentUpsert to return an empty slice; got %v", got)
 	}
 }
@@ -0,0 +1,213 @@
+// Copyright 2020 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package postgres
+
+import (
+	"path/filepath"
+	"strings"
+	"unicode"
+
+	"github.com/russross/blackfriday/v2"
+)
+
+const (
+	maxSectionWords   = 50
+	maxReadmeFraction = 0.5
+)
+
+// SearchDocumentSections computes the B and C sections of a Postgres search
+// document from a package synopsis and a README.
+// By "B section" and "C section" we mean the portion of the tsvector with weight
+// "B" and "C", respectively.
+//
+// The B section consists of the synopsis.
+// The C section consists of the first sentence of the README.
+// The D section consists of the remainder of the README.
+// All sections are split into words and processed for replacements.
+// Each section is limited to maxSectionWords words, and in addition the
+// D section is limited to an initial fraction of the README, determined
+// by maxReadmeFraction.
+func SearchDocumentSections(synopsis, readmeFilename, readme string) (b, c, d string) {
+	return searchDocumentSections(synopsis, readmeFilename, readme, maxSectionWords, maxReadmeFraction)
+}
+
+func searchDocumentSections(synopsis, readmeFilename, readme string, maxSecWords int, maxReadmeFrac float64) (b, c, d string) {
+	var readmeFirst, readmeRest string
+	if isMarkdown(readmeFilename) {
+		readme = processMarkdown(readme)
+	}
+	if i := sentenceEndIndex(readme); i > 0 {
+		readmeFirst, readmeRest = readme[:i+1], readme[i+1:]
+	} else {
+		readmeRest = readme
+	}
+	sw := processWords(synopsis)
+	rwf := processWords(readmeFirst)
+	rwr := processWords(readmeRest)
+
+	sectionB, _ := split(sw, maxSecWords)
+	sectionC, rwfd := split(rwf, maxSecWords)
+	// section D is the part of the readme that is not in sectionC.
+	rwd := append(rwfd, rwr...)
+	// Keep maxSecWords of section D, but not more than maxReadmeFrac.
+	f := int(maxReadmeFrac * float64(len(rwd)))
+	nkeep := maxSecWords
+	if nkeep > f {
+		nkeep = f
+	}
+	sectionD, _ := split(rwd, nkeep)
+
+	// If there is no synopsis, use first sentence of the README.
+	// But do not promote the rest of the README to section C.
+	if len(sectionB) == 0 {
+		sectionB = sectionC
+		sectionC = nil
+	}
+
+	prep := func(ws []string) string {
+		return makeValidUnicode(strings.Join(ws, " "))
+	}
+
+	return prep(sectionB), prep(sectionC), prep(sectionD)
+}
+
+// split splits a slice of strings into two parts. The first has length <= n,
+// and the second is the rest of the slice. If n is negative, the first part is nil and
+// the second part is the entire slice.
+func split(a []string, n int) ([]string, []string) {
+	if n >= len(a) {
+		return a, nil
+	}
+	return a[:n], a[n:]
+}
+
+// sentenceEndIndex returns the index in s of the end of the first sentence, or
+// -1 if no end can be found. A sentence ends at a '.', '!' or '?' that is
+// followed by a space (or ends the string), and is not preceded by an
+// uppercase letter.
+func sentenceEndIndex(s string) int {
+	var prev1, prev2 rune
+
+	end := func() bool {
+		return !unicode.IsUpper(prev2) && (prev1 == '.' || prev1 == '!' || prev1 == '?')
+	}
+
+	for i, r := range s {
+		if unicode.IsSpace(r) && end() {
+			return i - 1
+		}
+		prev2 = prev1
+		prev1 = r
+	}
+	if end() {
+		return len(s) - 1
+	}
+	return -1
+}
+
+// processWords splits s into words at whitespace, then processes each word.
+func processWords(s string) []string {
+	fields := strings.Fields(strings.ToLower(s))
+	var words []string
+	for _, f := range fields {
+		words = append(words, processWord(f)...)
+	}
+	return words
+}
+
+// summaryReplacements is used to replace words with other words.
+// It is used by processWord, below.
+// Example key-value pairs:
+//   "deleteMe": nil                        // removes "deleteMe"
+//   "rand":     []string{"random"}         // replace "rand" with "random"
+//   "utf-8":    []string{"utf-8", "utf8"}  // add "utf8" whenever "utf-8" is seen
+var summaryReplacements = map[string][]string{
+	"postgres":   []string{"postgres", "postgresql"},
+	"postgresql": []string{"postgres", "postgresql"},
+	"rand":       []string{"random"},
+	"mongo":      []string{"mongo", "mongodb"},
+	"mongodb":    []string{"mongo", "mongodb"},
+	"redis":      []string{"redis", "redisdb"},
+	"redisdb":    []string{"redis", "redisdb"},
+	"logger":     []string{"logger", "log"}, // Postgres stemmer does not handle -er
+	"parser":     []string{"parser", "parse"},
+	"utf-8":      []string{"utf-8", "utf8"},
+}
+
+// processWord performs processing on s, returning zero or more words.
+// Its main purpose is to apply summaryReplacements to replace
+// certain words with synonyms or additional search terms.
+func processWord(s string) []string {
+	s = strings.TrimFunc(s, unicode.IsPunct)
+	if s == "" {
+		return nil
+	}
+	if rs, ok := summaryReplacements[s]; ok {
+		return rs
+	}
+	if !hyphenSplit(s) {
+		return []string{s}
+	}
+	// Apply replacements to parts of hyphenated words.
+	ws := strings.Split(s, "-")
+	if len(ws) == 1 {
+		return ws
+	}
+	result := []string{s} // Include the full hyphenated word.
+	for _, w := range ws {
+		if rs, ok := summaryReplacements[w]; ok {
+			result = append(result, rs...)
+		}
+		// We don't need to include the parts; the Postgres text-search processor will do that.
+	}
+	return result
+}
+
+// hyphenSplit reports whether s should be split on hyphens.
+func hyphenSplit(s string) bool {
+	return !(strings.HasPrefix(s, "http://") || strings.HasPrefix(s, "https://"))
+}
+
+// isMarkdown reports whether filename says that the file contains markdown.
+func isMarkdown(filename string) bool {
+	ext := strings.ToLower(filepath.Ext(filename))
+	// https://tools.ietf.org/html/rfc7763 mentions both extensions.
+	return ext == ".md" || ext == ".markdown"
+}
+
+// processMarkdown returns the text of a markdown document.
+// It omits all formatting and images.
+func processMarkdown(s string) string {
+	parser := blackfriday.New(blackfriday.WithExtensions(blackfriday.CommonExtensions))
+	root := parser.Parse([]byte(s))
+	buf := walkMarkdown(root, nil, 0)
+	return string(buf)
+}
+
+// walkMarkdown traverses a blackfriday parse tree, extracting text.
+func walkMarkdown(n *blackfriday.Node, buf []byte, level int) []byte {
+	if n == nil {
+		return buf
+	}
+	switch n.Type {
+	case blackfriday.Image:
+		// Skip images because they usually are irrelevant to the package
+		// (badges and such).
+		return buf
+	case blackfriday.CodeBlock:
+		// Skip code blocks because they have a wide variety of unrelated symbols.
+		return buf
+	case blackfriday.Paragraph, blackfriday.Heading:
+		if len(buf) > 0 {
+			buf = append(buf, ' ')
+		}
+	default:
+		buf = append(buf, n.Literal...)
+	}
+	for c := n.FirstChild; c != nil; c = c.Next {
+		buf = walkMarkdown(c, buf, level+1)
+	}
+	return buf
+}
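To make the section-D word budget concrete: the "non-markdown" test case in the next file has a 13-word README with no sentence end, so with maxSecWords = 6 and a fraction of 0.5, D keeps min(6, int(0.5*13)) = 6 words. A standalone restatement of that arithmetic, mirroring the nkeep computation above:

// sectionDBudget restates the section-D budget from searchDocumentSections:
// keep at most maxSecWords words, but never more than maxReadmeFrac of the
// README's remaining words.
func sectionDBudget(readmeWords, maxSecWords int, maxReadmeFrac float64) int {
	f := int(maxReadmeFrac * float64(readmeWords))
	if maxSecWords > f {
		return f
	}
	return maxSecWords
}

// sectionDBudget(13, 6, 0.5) == 6, yielding "this readme doesn't have a sentence".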
@@ -0,0 +1,133 @@
+// Copyright 2020 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package postgres
+
+import (
+	"testing"
+
+	"github.com/google/go-cmp/cmp"
+)
+
+func TestSearchDocumentSections(t *testing.T) {
+	for _, test := range []struct {
+		name                string
+		synopsis            string
+		readmeFilename      string
+		readmeContents      string
+		wantB, wantC, wantD string
+	}{
+		{
+			"blackfriday",
+			"This is a synopsis.",
+			"foo.md",
+			`Package blackfriday is a [markdown](http://foo) processor. That _is_ all that it is.`,
+
+			"this is a synopsis",
+			"package blackfriday is a markdown processor",
+			"that is all",
+		},
+		{
+			"non-markdown",
+			"This synopsis is too long so we'll truncate it.",
+			"README",
+			"This README doesn't have a sentence end so the whole thing is D",
+
+			"this synopsis is too long so",
+			"",
+			"this readme doesn't have a sentence",
+		},
+		{
+			"viper",
+			"",
+			"README.md",
+			`
+![viper logo](https://cloud.githubusercontent.com/assets/173412/10886745/998df88a-8151-11e5-9448-4736db51020d.png)
+
+Go configuration with fangs!
+
+[![Actions](https://github.com/spf13/viper/workflows/CI/badge.svg)](https://github.com/spf13/viper)
+[![Join the chat at https://gitter.im/spf13/viper](https://badges.gitter.im/Join%20Chat.svg)](https://gitter.im/spf13/viper?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge)
+[![GoDoc](https://godoc.org/github.com/spf13/viper?status.svg)](https://godoc.org/github.com/spf13/viper)
+
+Many Go projects are built using Viper including:`,
+
+			"go configuration with fangs", // first sentence of README promoted
+			"",
+			"many go projects are",
+		},
+	} {
+		gotB, gotC, gotD := searchDocumentSections(test.synopsis, test.readmeFilename, test.readmeContents, 6, 0.5)
+		if gotB != test.wantB {
+			t.Errorf("%s, B: got %q, want %q", test.name, gotB, test.wantB)
+		}
+		if gotC != test.wantC {
+			t.Errorf("%s, C: got %q, want %q", test.name, gotC, test.wantC)
+		}
+		if gotD != test.wantD {
+			t.Errorf("%s, D: got %q, want %q", test.name, gotD, test.wantD)
+		}
+	}
+}
+
+func TestProcessWords(t *testing.T) {
+	for _, test := range []struct {
+		in   string
+		want []string
+	}{
+		{"", nil},
+		{"foo", []string{"foo"}},
+		{" foo \t bar\n", []string{"foo", "bar"}},
+		{"http://foo/bar/baz?x=1", []string{"http://foo/bar/baz?x=1"}},
+		{"This, however, shall. not; stand?", []string{"this", "however", "shall", "not", "stand"}},
+		{"a postgres and NATS server over HTTP", []string{
+			"a", "postgres", "postgresql", "and", "nats", "server", "over", "http"}},
+		{"http://a-b-c.com full-text chart-parser", []string{
+			"http://a-b-c.com", "full-text", "chart-parser", "parser", "parse"}},
+	} {
+		got := processWords(test.in)
+		if !cmp.Equal(got, test.want) {
+			t.Errorf("%q:\ngot %#v\nwant %#v", test.in, got, test.want)
+		}
+	}
+}
+
+func TestProcessMarkdown(t *testing.T) {
+	const (
+		in = `
+Blackfriday [![Build Status](https://travis-ci.org/russross/blackfriday.svg?branch=master)](https://travis-ci.org/russross/blackfriday)
+===========
+
+_Blackfriday_ is a [Markdown][1] *processor* implemented in [Go](https://golang.org).
+
+[1]: https://daringfireball.net/projects/markdown/ "Markdown"
+`
+
+		want = `Blackfriday Blackfriday is a Markdown processor implemented in Go.`
+	)
+
+	got := processMarkdown(in)
+	if got != want {
+		t.Errorf("got\n%s\nwant\n%s", got, want)
+	}
+}
+
+func TestSentenceEndIndex(t *testing.T) {
+	for _, test := range []struct {
+		in   string
+		want int
+	}{
+		{"", -1},
+		{"Hello. What's up?", 5},
+		{"unicode π∆!", 13},
+		{"D. C. Fontana?", 13},
+		{"D. c. Fontana?", 4},
+		{"no end", -1},
+	} {
+		got := sentenceEndIndex(test.in)
+		if got != test.want {
+			t.Errorf("%s: got %d, want %d", test.in, got, test.want)
+		}
+	}
+}
@@ -163,16 +163,16 @@ func (s *Server) handlePopulateSearchDocuments(w http.ResponseWriter, r *http.Re
 	limit := parseIntParam(r, "limit", 100)
 	ctx := r.Context()
 	log.Infof(ctx, "Populating search documents for %d packages", limit)
-	pkgPaths, err := s.db.GetPackagesForSearchDocumentUpsert(ctx, limit)
+	sdargs, err := s.db.GetPackagesForSearchDocumentUpsert(ctx, limit)
 	if err != nil {
 		log.Errorf(ctx, "s.db.GetPackagesSearchDocumentUpsert(ctx): %v", err)
 		http.Error(w, http.StatusText(http.StatusInternalServerError), http.StatusInternalServerError)
 		return
 	}
 
-	for _, path := range pkgPaths {
-		if err := s.db.UpsertSearchDocument(ctx, path); err != nil {
-			log.Errorf(ctx, "s.db.UpsertSearchDocument(ctx, %q): %v", path, err)
+	for _, args := range sdargs {
+		if err := s.db.UpsertSearchDocument(ctx, args); err != nil {
+			log.Errorf(ctx, "s.db.UpsertSearchDocument(ctx, %v): %v", args, err)
 			http.Error(w, http.StatusText(http.StatusInternalServerError), http.StatusInternalServerError)
 			return
 		}
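As an operational footnote, this handler backfills packages that have no search_documents row yet, in batches of `limit`. A hypothetical way to drive it (the host and route path are assumptions; the handler's registration is outside this diff):

package sketch

import "net/http"

// populateOnce triggers one batch of up to 500 search-document upserts.
// The base URL and route here are illustrative, not taken from the commit.
func populateOnce(client *http.Client, base string) error {
	resp, err := client.Get(base + "/populate-search-documents?limit=500")
	if err != nil {
		return err
	}
	return resp.Body.Close()
}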