From ba9b5823bf00a1fd5e754b72936473a92514ebe2 Mon Sep 17 00:00:00 2001 From: Jonathan Amsterdam Date: Mon, 30 Mar 2020 17:18:52 -0400 Subject: [PATCH] internal/postgres: change search tokens Change the way that the Postgres ts_vector (list of search tokens) is computed. - Use the path_tokens text configuration when creating the tsvector for the path. - Construct sections B and C of the search document by combining the synopsis and parts of the README. Parts of this processing are: - Extract only the text of a markdown README, to remove images and other extraneous information. - Add alternatives to certain words in the synopsis and README. For example, add "postgresql" whenever we see "postgres". - Modify the ts_rank call in the code to use a B weight of 1. - Change the call to the database search function so that it invokes the function that has a B weight of 1. These changes will require re-computing the search_documents.tsv_search_tokens column. That should be done after these changes are deployed. 
Change-Id: Ib81601326f11efd81c8bc733694a000eccecf12b Reviewed-on: https://team-review.git.corp.google.com/c/golang/discovery/+/705958 Reviewed-by: Julie Qiu --- internal/postgres/insert_version.go | 23 +-- internal/postgres/insert_version_test.go | 2 +- internal/postgres/search.go | 81 ++++++--- internal/postgres/search_test.go | 48 +++-- internal/postgres/searchdoc.go | 213 +++++++++++++++++++++++ internal/postgres/searchdoc_test.go | 133 ++++++++++++++ internal/worker/server.go | 8 +- 7 files changed, 451 insertions(+), 57 deletions(-) create mode 100644 internal/postgres/searchdoc.go create mode 100644 internal/postgres/searchdoc_test.go diff --git a/internal/postgres/insert_version.go b/internal/postgres/insert_version.go index 5ba60fcd..908eefb6 100644 --- a/internal/postgres/insert_version.go +++ b/internal/postgres/insert_version.go @@ -75,13 +75,8 @@ func (db *DB) InsertModule(ctx context.Context, m *internal.Module) (err error) return err } - // Insert the module's non-internal packages into search_documents. - for _, pkg := range m.Packages { - if err := db.UpsertSearchDocument(ctx, pkg.Path); err != nil { - return err - } - } - return nil + // Insert the module's packages into search_documents. 
+ return db.UpsertSearchDocuments(ctx, m) } // saveModule inserts a Module into the database along with its packages, @@ -140,7 +135,7 @@ func (db *DB) saveModule(ctx context.Context, m *internal.Module) error { m.Version, m.CommitTime, m.ReadmeFilePath, - makeValidUnicode([]byte(m.ReadmeContents)), + makeValidUnicode(m.ReadmeContents), version.ForSorting(m.Version), m.VersionType, m.SeriesPath(), @@ -158,7 +153,7 @@ func (db *DB) saveModule(ctx context.Context, m *internal.Module) error { return fmt.Errorf("marshalling %+v: %v", l.Coverage, err) } licenseValues = append(licenseValues, m.ModulePath, m.Version, - l.FilePath, makeValidUnicode(l.Contents), pq.Array(l.Types), covJSON) + l.FilePath, makeValidUnicode(string(l.Contents)), pq.Array(l.Types), covJSON) } if len(licenseValues) > 0 { licenseCols := []string{ @@ -366,11 +361,11 @@ func (db *DB) DeleteModule(ctx context.Context, ddb *database.DB, modulePath, ve return err } -// makeValidUnicode removes null runes from license contents, because pq doesn't like them. -// Also, replace non-unicode characters with the Unicode replacement character, which is -// the behavior of for ... range on strings. -func makeValidUnicode(bs []byte) string { - s := string(bs) +// makeValidUnicode removes null runes from a string that will be saved in a +// column of type TEXT, because pq doesn't like them. It also replaces non-unicode +// characters with the Unicode replacement character, which is the behavior of +// for ... range on strings. 
+func makeValidUnicode(s string) string { var b strings.Builder for _, r := range s { if r != 0 { diff --git a/internal/postgres/insert_version_test.go b/internal/postgres/insert_version_test.go index 7144cb86..4f1bd066 100644 --- a/internal/postgres/insert_version_test.go +++ b/internal/postgres/insert_version_test.go @@ -323,7 +323,7 @@ func TestMakeValidUnicode(t *testing.T) { if (err == nil) != okRaw { t.Errorf("%s, raw: got %v, want error: %t", filename, err, okRaw) } - if err := insert(makeValidUnicode(data)); err != nil { + if err := insert(makeValidUnicode(string(data))); err != nil { t.Errorf("%s, after making valid: %v", filename, err) } } diff --git a/internal/postgres/search.go b/internal/postgres/search.go index 0c288308..57106320 100644 --- a/internal/postgres/search.go +++ b/internal/postgres/search.go @@ -9,7 +9,6 @@ import ( "database/sql" "fmt" "math" - "sort" "strings" "time" @@ -140,8 +139,11 @@ const ( // dramatic: being 2x as popular only has an additive effect. // - A penalty factor for non-redistributable modules, since a lot of // details cannot be displayed. +// The first argument to ts_rank is an array of weights for the four tsvector sections, +// in the order D, C, B, A. +// The weights below match the defaults except for B. 
var scoreExpr = fmt.Sprintf(` - ts_rank(tsv_search_tokens, websearch_to_tsquery($1)) * + ts_rank('{0.1, 0.2, 1.0, 1.0}', tsv_search_tokens, websearch_to_tsquery($1)) * ln(exp(1)+imported_by_count) * CASE WHEN redistributable THEN 1 ELSE %f END * CASE WHEN COALESCE(has_go_mod, true) THEN 1 ELSE %f END @@ -392,7 +394,7 @@ func (db *DB) popularSearch(ctx context.Context, searchQuery string, limit, offs commit_time, imported_by_count, score - FROM popular_search_go_mod($1, $2, $3, $4, $5)` + FROM popular_search($1, $2, $3, $4, $5)` var results []*internal.SearchResult collect := func(rows *sql.Rows) error { var r internal.SearchResult @@ -496,13 +498,10 @@ var upsertSearchStatement = fmt.Sprintf(` m.commit_time, m.has_go_mod, ( - SETWEIGHT(TO_TSVECTOR($2), 'A') || - -- Try to limit to the maximum length of a tsvector. - -- This is just a guess, since the max length is in bytes and there - -- doesn't seem to be a way to determine the number of bytes in a tsvector. - -- Since the max is 1048575, make sure part is half that size. - SETWEIGHT(TO_TSVECTOR(left(p.synopsis, 1048575/2)), 'B') || - SETWEIGHT(TO_TSVECTOR(left(m.readme_contents, 1048575/2)), 'C') + SETWEIGHT(TO_TSVECTOR('path_tokens', $2), 'A') || + SETWEIGHT(TO_TSVECTOR($3), 'B') || + SETWEIGHT(TO_TSVECTOR($4), 'C') || + SETWEIGHT(TO_TSVECTOR($5), 'D') ), hll_hash(p.path) & (%[1]d - 1), hll_zeros(hll_hash(p.path)) @@ -544,52 +543,86 @@ var upsertSearchStatement = fmt.Sprintf(` END) ;`, hllRegisterCount) +// UpsertSearchDocuments adds search information for mod to the search_documents table. 
+func (db *DB) UpsertSearchDocuments(ctx context.Context, mod *internal.Module) (err error) { + defer derrors.Wrap(&err, "UpsertSearchDocuments(ctx, %q)", mod.ModulePath) + + for _, pkg := range mod.Packages { + if isInternalPackage(pkg.Path) { + continue + } + err := db.UpsertSearchDocument(ctx, upsertSearchDocumentArgs{ + PackagePath: pkg.Path, + ModulePath: mod.ModulePath, + Synopsis: pkg.Synopsis, + ReadmeFilePath: mod.ReadmeFilePath, + ReadmeContents: mod.ReadmeContents, + }) + if err != nil { + return err + } + } + return nil +} + +type upsertSearchDocumentArgs struct { + PackagePath string + ModulePath string + Synopsis string + ReadmeFilePath string + ReadmeContents string +} + // UpsertSearchDocument inserts a row for each package in the module, if that // package is the latest version and is not internal. // // The given module should have already been validated via a call to // validateModule. -func (db *DB) UpsertSearchDocument(ctx context.Context, path string) (err error) { - defer derrors.Wrap(&err, "UpsertSearchDocument(ctx, %q)", path) +func (db *DB) UpsertSearchDocument(ctx context.Context, args upsertSearchDocumentArgs) (err error) { + defer derrors.Wrap(&err, "UpsertSearchDocument(ctx, %q, %q)", args.PackagePath, args.ModulePath) - if isInternalPackage(path) { - return nil + // Only summarize the README if the package and module have the same path. 
+ if args.PackagePath != args.ModulePath { + args.ReadmeFilePath = "" + args.ReadmeContents = "" } - pathTokens := strings.Join(GeneratePathTokens(path), " ") - _, err = db.db.Exec(ctx, upsertSearchStatement, path, pathTokens) + pathTokens := strings.Join(GeneratePathTokens(args.PackagePath), " ") + sectionB, sectionC, sectionD := SearchDocumentSections(args.Synopsis, args.ReadmeFilePath, args.ReadmeContents) + _, err = db.db.Exec(ctx, upsertSearchStatement, args.PackagePath, pathTokens, sectionB, sectionC, sectionD) return err } // GetPackagesForSearchDocumentUpsert fetches all paths from packages that do // not exist in search_documents. -func (db *DB) GetPackagesForSearchDocumentUpsert(ctx context.Context, limit int) (paths []string, err error) { +func (db *DB) GetPackagesForSearchDocumentUpsert(ctx context.Context, limit int) (argsList []upsertSearchDocumentArgs, err error) { defer derrors.Add(&err, "GetPackagesForSearchDocumentUpsert(ctx, %d)", limit) query := ` - SELECT DISTINCT(path) + SELECT DISTINCT ON (p.path) p.path, m.module_path, p.synopsis, m.readme_file_path, m.readme_contents FROM packages p + INNER JOIN modules m + USING (module_path, version) LEFT JOIN search_documents sd ON p.path = sd.package_path WHERE sd.package_path IS NULL LIMIT $1` collect := func(rows *sql.Rows) error { - var path string - if err := rows.Scan(&path); err != nil { + var a upsertSearchDocumentArgs + if err := rows.Scan(&a.PackagePath, &a.ModulePath, &a.Synopsis, &a.ReadmeFilePath, &a.ReadmeContents); err != nil { return err } // Filter out packages in internal directories, since // they are skipped when upserting search_documents. 
- if !isInternalPackage(path) { - paths = append(paths, path) + if !isInternalPackage(a.PackagePath) { + argsList = append(argsList, a) } return nil } if err := db.db.RunQuery(ctx, query, collect, limit); err != nil { return nil, err } - sort.Strings(paths) - return paths, nil + return argsList, nil } // UpdateSearchDocumentsImportedByCount updates imported_by_count and diff --git a/internal/postgres/search_test.go b/internal/postgres/search_test.go index ca841cac..5d450408 100644 --- a/internal/postgres/search_test.go +++ b/internal/postgres/search_test.go @@ -369,6 +369,12 @@ func TestInsertSearchDocumentAndSearch(t *testing.T) { } ) + const ( + packageScore = 0.6079270839691162 + goAndCDKScore = 0.999817967414856 + cloudScore = 0.8654518127441406 + ) + for _, tc := range []struct { name string packages map[string]*internal.Package @@ -384,8 +390,8 @@ func TestInsertSearchDocumentAndSearch(t *testing.T) { modKube: pkgKube, }, want: []*internal.SearchResult{ - goCdkResult(0.2431708425283432, 2), - kubeResult(0.2431708425283432, 2), + goCdkResult(packageScore, 2), + kubeResult(packageScore, 2), }, }, { @@ -398,7 +404,7 @@ func TestInsertSearchDocumentAndSearch(t *testing.T) { modGoCDK: pkgGoCDK, }, want: []*internal.SearchResult{ - goCdkResult(0.2431708425283432, 2), + goCdkResult(packageScore, 2), }, }, { @@ -411,7 +417,7 @@ func TestInsertSearchDocumentAndSearch(t *testing.T) { modKube: pkgKube, }, want: []*internal.SearchResult{ - kubeResult(0.2431708425283432, 2), + kubeResult(packageScore, 2), }, }, { @@ -422,7 +428,7 @@ func TestInsertSearchDocumentAndSearch(t *testing.T) { modKube: pkgKube, }, want: []*internal.SearchResult{ - goCdkResult(0.733867883682251, 1), + goCdkResult(goAndCDKScore, 1), }, }, { @@ -432,7 +438,7 @@ func TestInsertSearchDocumentAndSearch(t *testing.T) { modGoCDK: pkgGoCDK, }, want: []*internal.SearchResult{ - goCdkResult(0.7109370231628418, 1), + goCdkResult(cloudScore, 1), }, }, } { @@ -820,28 +826,42 @@ func 
TestGetPackagesForSearchDocumentUpsert(t *testing.T) { } // pkgPaths should be "A", since pkg "A" exists in packages but not // search_documents. - pkgPaths, err := testDB.GetPackagesForSearchDocumentUpsert(ctx, 10) + got, err := testDB.GetPackagesForSearchDocumentUpsert(ctx, 10) if err != nil { t.Fatal(err) } - want := []string{"A", "A/notinternal"} - if diff := cmp.Diff(want, pkgPaths); diff != "" { + sort.Slice(got, func(i, j int) bool { return got[i].PackagePath < got[j].PackagePath }) + want := []upsertSearchDocumentArgs{ + { + PackagePath: "A", + ModulePath: moduleA.ModulePath, + ReadmeFilePath: "README.md", + ReadmeContents: "readme", + }, + { + PackagePath: "A/notinternal", + ModulePath: moduleA.ModulePath, + ReadmeFilePath: "README.md", + ReadmeContents: "readme", + }, + } + if diff := cmp.Diff(want, got); diff != "" { t.Fatalf("testDB.GetPackagesForSearchDocumentUpsert mismatch(-want +got):\n%s", diff) } - for _, path := range want { - if err := testDB.UpsertSearchDocument(ctx, path); err != nil { + for _, args := range got { + if err := testDB.UpsertSearchDocument(ctx, args); err != nil { t.Fatal(err) } } // pkgPaths should be an empty slice, since pkg "A" and "A/notinternal" // were just inserted into search_documents. - pkgPaths, err = testDB.GetPackagesForSearchDocumentUpsert(ctx, 10) + got, err = testDB.GetPackagesForSearchDocumentUpsert(ctx, 10) if err != nil { t.Fatal(err) } - if len(pkgPaths) != 0 { - t.Fatalf("expected testDB.GetPackagesForSearchDocumentUpsert to return an empty slice; got %v", pkgPaths) + if len(got) != 0 { + t.Fatalf("expected testDB.GetPackagesForSearchDocumentUpsert to return an empty slice; got %v", got) } } diff --git a/internal/postgres/searchdoc.go b/internal/postgres/searchdoc.go new file mode 100644 index 00000000..8bf1a9c1 --- /dev/null +++ b/internal/postgres/searchdoc.go @@ -0,0 +1,213 @@ +// Copyright 2020 The Go Authors. All rights reserved. 
+// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package postgres + +import ( + "path/filepath" + "strings" + "unicode" + + "github.com/russross/blackfriday/v2" +) + +const ( + maxSectionWords = 50 + maxReadmeFraction = 0.5 +) + +// SearchDocumentSections computes the B, C, and D sections of a Postgres search +// document from a package synopsis and a README. +// By "B section" and "C section" we mean the portion of the tsvector with weight +// "B" and "C", respectively. +// +// The B section consists of the synopsis. +// The C section consists of the first sentence of the README. +// The D section consists of the remainder of the README. +// All sections are split into words and processed for replacements. +// Each section is limited to maxSectionWords words, and in addition the +// D section is limited to an initial fraction of the README, determined +// by maxReadmeFraction. +func SearchDocumentSections(synopsis, readmeFilename, readme string) (b, c, d string) { + return searchDocumentSections(synopsis, readmeFilename, readme, maxSectionWords, maxReadmeFraction) +} + +func searchDocumentSections(synopsis, readmeFilename, readme string, maxSecWords int, maxReadmeFrac float64) (b, c, d string) { + var readmeFirst, readmeRest string + if isMarkdown(readmeFilename) { + readme = processMarkdown(readme) + } + if i := sentenceEndIndex(readme); i > 0 { + readmeFirst, readmeRest = readme[:i+1], readme[i+1:] + } else { + readmeRest = readme + } + sw := processWords(synopsis) + rwf := processWords(readmeFirst) + rwr := processWords(readmeRest) + + sectionB, _ := split(sw, maxSecWords) + sectionC, rwfd := split(rwf, maxSecWords) + // section D is the part of the readme that is not in sectionC. + rwd := append(rwfd, rwr...) + // Keep maxSecWords of section D, but not more than maxReadmeFrac. 
+ f := int(maxReadmeFrac * float64(len(rwd))) + nkeep := maxSecWords + if nkeep > f { + nkeep = f + } + sectionD, _ := split(rwd, nkeep) + + // If there is no synopsis, use first sentence of the README. + // But do not promote the rest of the README to section C. + if len(sectionB) == 0 { + sectionB = sectionC + sectionC = nil + } + + prep := func(ws []string) string { + return makeValidUnicode(strings.Join(ws, " ")) + } + + return prep(sectionB), prep(sectionC), prep(sectionD) +} + +// split splits a slice of strings into two parts. The first has length <= n, +// and the second is the rest of the slice. n must be nonnegative; a negative +// n makes the slice expression a[:n] panic. +func split(a []string, n int) ([]string, []string) { + if n >= len(a) { + return a, nil + } + return a[:n], a[n:] +} + +// sentenceEndIndex returns the index in s of the end of the first sentence, or +// -1 if no end can be found. A sentence ends at a '.', '!' or '?' that is +// followed by a space (or ends the string), and is not preceded by an +// uppercase letter. +func sentenceEndIndex(s string) int { + var prev1, prev2 rune + + end := func() bool { + return !unicode.IsUpper(prev2) && (prev1 == '.' || prev1 == '!' || prev1 == '?') + } + + for i, r := range s { + if unicode.IsSpace(r) && end() { + return i - 1 + } + prev2 = prev1 + prev1 = r + } + if end() { + return len(s) - 1 + } + return -1 +} + +// processWords splits s into words at whitespace, then processes each word. +func processWords(s string) []string { + fields := strings.Fields(strings.ToLower(s)) + var words []string + for _, f := range fields { + words = append(words, processWord(f)...) + } + return words +} + +// summaryReplacements is used to replace words with other words. +// It is used by processWord, below. 
+// Example key-value pairs: +// "deleteMe": nil // removes "deleteMe" +// "rand": []string{"random"} // replace "rand" with "random" +// "utf-8": []string{"utf-8", "utf8"} // add "utf8" whenever "utf-8" is seen +var summaryReplacements = map[string][]string{ + "postgres": []string{"postgres", "postgresql"}, + "postgresql": []string{"postgres", "postgresql"}, + "rand": []string{"random"}, + "mongo": []string{"mongo", "mongodb"}, + "mongodb": []string{"mongo", "mongodb"}, + "redis": []string{"redis", "redisdb"}, + "redisdb": []string{"redis", "redisdb"}, + "logger": []string{"logger", "log"}, // Postgres stemmer does not handle -er + "parser": []string{"parser", "parse"}, + "utf-8": []string{"utf-8", "utf8"}, +} + +// processWord performs processing on s, returning zero or more words. +// Its main purpose is to apply summaryReplacements to replace +// certain words with synonyms or additional search terms. +func processWord(s string) []string { + s = strings.TrimFunc(s, unicode.IsPunct) + if s == "" { + return nil + } + if rs, ok := summaryReplacements[s]; ok { + return rs + } + if !hyphenSplit(s) { + return []string{s} + } + // Apply replacements to parts of hyphenated words. + ws := strings.Split(s, "-") + if len(ws) == 1 { + return ws + } + result := []string{s} // Include the full hyphenated word. + for _, w := range ws { + if rs, ok := summaryReplacements[w]; ok { + result = append(result, rs...) + } + // We don't need to include the parts; the Postgres text-search processor will do that. + } + return result +} + +// hyphenSplit reports whether s should be split on hyphens. +func hyphenSplit(s string) bool { + return !(strings.HasPrefix(s, "http://") || strings.HasPrefix(s, "https://")) +} + +// isMarkdown reports whether filename says that the file contains markdown. +func isMarkdown(filename string) bool { + ext := strings.ToLower(filepath.Ext(filename)) + // https://tools.ietf.org/html/rfc7763 mentions both extensions. 
+ return ext == ".md" || ext == ".markdown" +} + +// processMarkdown returns the text of a markdown document. +// It omits all formatting and images. +func processMarkdown(s string) string { + parser := blackfriday.New(blackfriday.WithExtensions(blackfriday.CommonExtensions)) + root := parser.Parse([]byte(s)) + buf := walkMarkdown(root, nil, 0) + return string(buf) +} + +// walkMarkdown traverses a blackfriday parse tree, extracting text. +func walkMarkdown(n *blackfriday.Node, buf []byte, level int) []byte { + if n == nil { + return buf + } + switch n.Type { + case blackfriday.Image: + // Skip images because they usually are irrelevant to the package + // (badges and such). + return buf + case blackfriday.CodeBlock: + // Skip code blocks because they have a wide variety of unrelated symbols. + return buf + case blackfriday.Paragraph, blackfriday.Heading: + if len(buf) > 0 { + buf = append(buf, ' ') + } + default: + buf = append(buf, n.Literal...) + } + for c := n.FirstChild; c != nil; c = c.Next { + buf = walkMarkdown(c, buf, level+1) + } + return buf +} diff --git a/internal/postgres/searchdoc_test.go b/internal/postgres/searchdoc_test.go new file mode 100644 index 00000000..b8d374b7 --- /dev/null +++ b/internal/postgres/searchdoc_test.go @@ -0,0 +1,133 @@ +// Copyright 2020 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package postgres + +import ( + "testing" + + "github.com/google/go-cmp/cmp" +) + +func TestSearchDocumentSections(t *testing.T) { + for _, test := range []struct { + name string + synopsis string + readmeFilename string + readmeContents string + wantB, wantC, wantD string + }{ + { + "blackfriday", + "This is a synopsis.", + "foo.md", + `Package blackfriday is a [markdown](http://foo) processor. 
That _is_ all that it is.`, + + "this is a synopsis", + "package blackfriday is a markdown processor", + "that is all", + }, + { + "non-markdown", + "This synopsis is too long so we'll truncate it.", + "README", + "This README doesn't have a sentence end so the whole thing is D", + + "this synopsis is too long so", + "", + "this readme doesn't have a sentence", + }, + { + "viper", + "", + "README.md", + ` +![viper logo](https://cloud.githubusercontent.com/assets/173412/10886745/998df88a-8151-11e5-9448-4736db51020d.png) + +Go configuration with fangs! + +[![Actions](https://github.com/spf13/viper/workflows/CI/badge.svg)](https://github.com/spf13/viper) +[![Join the chat at https://gitter.im/spf13/viper](https://badges.gitter.im/Join%20Chat.svg)](https://gitter.im/spf13/viper?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge) +[![GoDoc](https://godoc.org/github.com/spf13/viper?status.svg)](https://godoc.org/github.com/spf13/viper) + +Many Go projects are built using Viper including:`, + + "go configuration with fangs", // first sentence of README promoted + "", + "many go projects are", + }, + } { + gotB, gotC, gotD := searchDocumentSections(test.synopsis, test.readmeFilename, test.readmeContents, 6, 0.5) + if gotB != test.wantB { + t.Errorf("%s, B: got %q, want %q", test.name, gotB, test.wantB) + } + if gotC != test.wantC { + t.Errorf("%s, C: got %q, want %q", test.name, gotC, test.wantC) + } + if gotD != test.wantD { + t.Errorf("%s, D: got %q, want %q", test.name, gotD, test.wantD) + } + } +} + +func TestProcessWords(t *testing.T) { + for _, test := range []struct { + in string + want []string + }{ + {"", nil}, + {"foo", []string{"foo"}}, + {" foo \t bar\n", []string{"foo", "bar"}}, + {"http://foo/bar/baz?x=1", []string{"http://foo/bar/baz?x=1"}}, + {"This, however, shall. 
not; stand?", []string{"this", "however", "shall", "not", "stand"}}, + {"a postgres and NATS server over HTTP", []string{ + "a", "postgres", "postgresql", "and", "nats", "server", "over", "http"}}, + {"http://a-b-c.com full-text chart-parser", []string{ + "http://a-b-c.com", "full-text", "chart-parser", "parser", "parse"}}, + } { + got := processWords(test.in) + if !cmp.Equal(got, test.want) { + t.Errorf("%q:\ngot %#v\nwant %#v", test.in, got, test.want) + } + } +} + +func TestProcessMarkdown(t *testing.T) { + const ( + in = ` +Blackfriday [![Build Status](https://travis-ci.org/russross/blackfriday.svg?branch=master)](https://travis-ci.org/russross/blackfriday) +=========== + +_Blackfriday_ is a [Markdown][1] *processor* implemented in [Go](https://golang.org). + +[1]: https://daringfireball.net/projects/markdown/ "Markdown" +` + + want = `Blackfriday Blackfriday is a Markdown processor implemented in Go.` + ) + + got := processMarkdown(in) + if got != want { + t.Errorf("got\n%s\nwant\n%s", got, want) + } +} + +func TestSentenceEndIndex(t *testing.T) { + for _, test := range []struct { + in string + want int + }{ + {"", -1}, + {"Hello. What's up?", 5}, + {"unicode π∆!", 13}, + {"D. C. Fontana?", 13}, + {"D. c. 
Fontana?", 4}, + {"no end", -1}, + } { + got := sentenceEndIndex(test.in) + if got != test.want { + t.Errorf("%s: got %d, want %d", test.in, got, test.want) + } + } +} diff --git a/internal/worker/server.go b/internal/worker/server.go index 165344b7..dcb8b3eb 100644 --- a/internal/worker/server.go +++ b/internal/worker/server.go @@ -163,16 +163,16 @@ func (s *Server) handlePopulateSearchDocuments(w http.ResponseWriter, r *http.Re limit := parseIntParam(r, "limit", 100) ctx := r.Context() log.Infof(ctx, "Populating search documents for %d packages", limit) - pkgPaths, err := s.db.GetPackagesForSearchDocumentUpsert(ctx, limit) + sdargs, err := s.db.GetPackagesForSearchDocumentUpsert(ctx, limit) if err != nil { log.Errorf(ctx, "s.db.GetPackagesSearchDocumentUpsert(ctx): %v", err) http.Error(w, http.StatusText(http.StatusInternalServerError), http.StatusInternalServerError) return } - for _, path := range pkgPaths { - if err := s.db.UpsertSearchDocument(ctx, path); err != nil { - log.Errorf(ctx, "s.db.UpsertSearchDocument(ctx, %q): %v", path, err) + for _, args := range sdargs { + if err := s.db.UpsertSearchDocument(ctx, args); err != nil { + log.Errorf(ctx, "s.db.UpsertSearchDocument(ctx, %v): %v", args, err) http.Error(w, http.StatusText(http.StatusInternalServerError), http.StatusInternalServerError) return }