pkgsite/internal/postgres/searchdoc.go

// Copyright 2020 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package postgres

import (
	"path/filepath"
	"strings"
	"unicode"

	"github.com/russross/blackfriday/v2"
)

const (
	// maxSectionWords is the word limit that SearchDocumentSections passes
	// to searchDocumentSections; it caps the number of words kept in each
	// section of the search document.
	maxSectionWords   = 50
	// maxReadmeFraction is the fraction that SearchDocumentSections passes
	// to searchDocumentSections; the D section keeps at most this fraction
	// of the README words that are not already in the C section.
	maxReadmeFraction = 0.5
)

// SearchDocumentSections computes the B, C and D sections of a Postgres
// search document from a package synopsis and a README.
// By "B section", "C section" and "D section" we mean the portion of the
// tsvector with weight "B", "C" and "D", respectively.
//
// The B section consists of the synopsis.
// The C section consists of the first sentence of the README.
// The D section consists of the remainder of the README.
// If the synopsis yields no words, the first sentence of the README is used
// as the B section instead, and the C section is left empty.
//
// If readmeFilename has a markdown extension, only the text of the README is
// used. All sections are lower-cased, split into words and processed for
// replacements.
// Each section is limited to maxSectionWords words, and in addition the
// D section is limited to an initial fraction, determined by
// maxReadmeFraction, of the README words that are not in the C section.
func SearchDocumentSections(synopsis, readmeFilename, readme string) (b, c, d string) {
	return searchDocumentSections(synopsis, readmeFilename, readme, maxSectionWords, maxReadmeFraction)
}

// searchDocumentSections is the implementation of SearchDocumentSections with
// the limits made explicit: maxSecWords caps the number of words in each
// section, and maxReadmeFrac caps section D to that fraction of the README
// words left over after section C.
func searchDocumentSections(synopsis, readmeFilename, readme string, maxSecWords int, maxReadmeFrac float64) (b, c, d string) {
	if isMarkdown(readmeFilename) {
		readme = processMarkdown(readme)
	}

	// Cut the README just after its first sentence. When no sentence end is
	// found, the whole README counts as "the rest".
	firstSentence, remainder := "", readme
	if end := sentenceEndIndex(readme); end > 0 {
		firstSentence, remainder = readme[:end+1], readme[end+1:]
	}

	synopsisWords := processWords(synopsis)
	firstWords := processWords(firstSentence)
	restWords := processWords(remainder)

	sectionB, _ := split(synopsisWords, maxSecWords)
	sectionC, overflow := split(firstWords, maxSecWords)

	// Section D draws from every README word that did not fit in section C:
	// the overflow of the first sentence followed by the rest of the README.
	candidatesD := append(overflow, restWords...)

	// Keep the smaller of maxSecWords and the allowed fraction of candidates.
	limitD := int(maxReadmeFrac * float64(len(candidatesD)))
	if limitD > maxSecWords {
		limitD = maxSecWords
	}
	sectionD, _ := split(candidatesD, limitD)

	// With no synopsis, the README's first sentence stands in for it.
	// Section C is then left empty; the rest of the README is not promoted.
	if len(sectionB) == 0 {
		sectionB, sectionC = sectionC, nil
	}

	join := func(words []string) string {
		return makeValidUnicode(strings.Join(words, " "))
	}
	return join(sectionB), join(sectionC), join(sectionD)
}

// split splits a slice of strings into two parts. The first has length <= n,
// and the second is the rest of the slice.
//
// If n is negative, the first part is nil and the second part is the entire
// slice. If n is at least len(a), the first part is the entire slice and the
// second part is nil.
//
// Both parts share a's backing array; no elements are copied.
func split(a []string, n int) ([]string, []string) {
	// A negative n cannot be used as a slice bound (a[:n] would panic), so
	// handle it explicitly as "keep nothing".
	if n < 0 {
		return nil, a
	}
	if n >= len(a) {
		return a, nil
	}
	return a[:n], a[n:]
}

// sentenceEndIndex reports the byte index in s of the character that ends the
// first sentence, or -1 when s contains no sentence end.
//
// A sentence end is a '.', '!' or '?' that is immediately followed by
// whitespace or by the end of the string, and whose preceding character is
// not an uppercase letter (so the dots in "U.S. " do not end a sentence).
func sentenceEndIndex(s string) int {
	// last is the most recently consumed rune, beforeLast the one before it.
	var last, beforeLast rune

	// terminated reports whether last is a sentence terminator that does not
	// directly follow an uppercase letter.
	terminated := func() bool {
		switch last {
		case '.', '!', '?':
			return !unicode.IsUpper(beforeLast)
		}
		return false
	}

	for pos, cur := range s {
		if terminated() && unicode.IsSpace(cur) {
			// The terminator is a single-byte rune sitting right before cur.
			return pos - 1
		}
		beforeLast, last = last, cur
	}
	if !terminated() {
		return -1
	}
	return len(s) - 1
}

// processWords lower-cases s, breaks it into whitespace-separated fields, and
// returns the concatenation of processWord applied to each field in order.
// The result is nil when no field produces a word.
func processWords(s string) []string {
	var out []string
	lowered := strings.ToLower(s)
	for _, field := range strings.Fields(lowered) {
		out = append(out, processWord(field)...)
	}
	return out
}

// summaryReplacements maps a word to the list of words that should appear in
// its place in the search document.
// It is used by processWord, below, which looks up both whole words and the
// individual parts of hyphenated words. Words reaching processWord through
// processWords have already been lower-cased, so keys should be lower case.
//
// A value that repeats its key keeps the original word and adds alternatives;
// a value that omits its key replaces the word outright.
// Example key-value pairs:
//
//	"deleteMe": nil                     // removes "deleteMe"
//	"rand": []string{"random"}          // replace "rand" with "random"
//	"utf-8": []string{"utf-8", "utf8"}  // add "utf8" whenever "utf-8" is seen
var summaryReplacements = map[string][]string{
	"postgres":   {"postgres", "postgresql"},
	"postgresql": {"postgres", "postgresql"},
	"rand":       {"random"},
	"mongo":      {"mongo", "mongodb"},
	"mongodb":    {"mongo", "mongodb"},
	"redis":      {"redis", "redisdb"},
	"redisdb":    {"redis", "redisdb"},
	"logger":     {"logger", "log"}, // Postgres stemmer does not handle -er
	"parser":     {"parser", "parse"},
	"utf-8":      {"utf-8", "utf8"},
}

// processWord turns a single whitespace-delimited field into zero or more
// search words. Its main job is to apply summaryReplacements so that certain
// words are swapped for, or accompanied by, synonyms.
//
// Leading and trailing punctuation is stripped first; a field that is nothing
// but punctuation yields nil. A word with its own entry in summaryReplacements
// yields that entry. Otherwise the word is returned as is, and if it is a
// hyphenated word (and not a URL) the replacements for its hyphen-separated
// parts are appended after it.
func processWord(s string) []string {
	word := strings.TrimFunc(s, unicode.IsPunct)
	if word == "" {
		return nil
	}
	if replacement, found := summaryReplacements[word]; found {
		return replacement
	}
	// URLs are never taken apart, and a word without a hyphen has no parts.
	if !hyphenSplit(word) || !strings.Contains(word, "-") {
		return []string{word}
	}

	// Start with the full hyphenated word, then add the replacements of any
	// part that has one. The bare parts themselves are not added; the
	// Postgres text-search processor will do that.
	out := []string{word}
	for _, part := range strings.Split(word, "-") {
		out = append(out, summaryReplacements[part]...)
	}
	return out
}

// hyphenSplit reports whether s should be split on hyphens.
// Everything qualifies except strings that start with an http or https URL
// scheme.
func hyphenSplit(s string) bool {
	for _, scheme := range []string{"http://", "https://"} {
		if strings.HasPrefix(s, scheme) {
			return false
		}
	}
	return true
}

// isMarkdown reports whether filename, judged by its extension alone, names a
// markdown file. The comparison ignores case.
func isMarkdown(filename string) bool {
	switch strings.ToLower(filepath.Ext(filename)) {
	case ".md", ".markdown":
		// https://tools.ietf.org/html/rfc7763 mentions both extensions.
		return true
	default:
		return false
	}
}

// processMarkdown returns the text of a markdown document.
// The document is parsed with blackfriday's common extensions enabled, and
// the text is collected from the parse tree by walkMarkdown, so formatting
// and images are left out.
func processMarkdown(s string) string {
	md := blackfriday.New(blackfriday.WithExtensions(blackfriday.CommonExtensions))
	return string(walkMarkdown(md.Parse([]byte(s)), nil, 0))
}

// walkMarkdown appends the text found in the blackfriday parse tree rooted at
// n to buf and returns the extended buffer. A nil n leaves buf unchanged.
//
// Images and code blocks are dropped together with everything beneath them.
// Paragraphs and headings add a separating space (unless buf is still empty)
// before their children are visited. Every other node contributes its
// Literal bytes, followed by whatever its children contribute.
//
// level is the depth of n in the tree; it is only passed along, incremented,
// to the recursive calls.
func walkMarkdown(n *blackfriday.Node, buf []byte, level int) []byte {
	if n == nil {
		return buf
	}
	switch kind := n.Type; {
	case kind == blackfriday.Image || kind == blackfriday.CodeBlock:
		// Images are usually irrelevant to the package (badges and such),
		// and code blocks have a wide variety of unrelated symbols.
		return buf
	case kind == blackfriday.Paragraph || kind == blackfriday.Heading:
		if len(buf) > 0 {
			buf = append(buf, ' ')
		}
	default:
		buf = append(buf, n.Literal...)
	}
	child := n.FirstChild
	for child != nil {
		buf = walkMarkdown(child, buf, level+1)
		child = child.Next
	}
	return buf
}
internal/postgres: change search tokens Change the way that the Postgres ts_vector (list of search tokens) is computed. - Use the path_tokens text configuration when creating the tsvector for the path. - Construct sections B and C of the search document by combining the synopsis part of the README. Parts of this processing are: - Extract only the text of a markdown README, to remove images and other extraneous information. - Add alternatives to certain words in the synopsis and README. For example, add "postgresql" whenever we see "postgres". - Modify the ts_rank call in the code to use a B weight of 1. - Change the call to the database search function so that it invokes the function that has a B weight of 1. These changes will require re-computing the search_documents.tsv_search_tokens column. That should be done after these are deployed. Change-Id: Ib81601326f11efd81c8bc733694a000eccecf12b Reviewed-on: https://team-review.git.corp.google.com/c/golang/discovery/+/705958 Reviewed-by: Julie Qiu <julieqiu@google.com> 2020-03-31 00:18:52 +03:00			`// Copyright 2020 The Go Authors. All rights reserved.`
			`// Use of this source code is governed by a BSD-style`
			`// license that can be found in the LICENSE file.`

			`package postgres`

			`import (`
			`"path/filepath"`
			`"strings"`
			`"unicode"`

			`"github.com/russross/blackfriday/v2"`
			`)`

			`const (`
			`maxSectionWords = 50`
			`maxReadmeFraction = 0.5`
			`)`

			`// SearchDocumentSections computes the B and C sections of a Postgres search`
			`// document from a package synopsis and a README.`
			`// By "B section" and "C section" we mean the portion of the tsvector with weight`
			`// "B" and "C", respectively.`
			`//`
			`// The B section consists of the synopsis.`
			`// The C section consists of the first sentence of the README.`
			`// The D section consists of the remainder of the README.`
			`// All sections are split into words and processed for replacements.`
			`// Each section is limited to maxSectionWords words, and in addition the`
			`// D section is limited to an initial fraction of the README, determined`
			`// by maxReadmeFraction.`
			`func SearchDocumentSections(synopsis, readmeFilename, readme string) (b, c, d string) {`
			`return searchDocumentSections(synopsis, readmeFilename, readme, maxSectionWords, maxReadmeFraction)`
			`}`

			`func searchDocumentSections(synopsis, readmeFilename, readme string, maxSecWords int, maxReadmeFrac float64) (b, c, d string) {`
			`var readmeFirst, readmeRest string`
			`if isMarkdown(readmeFilename) {`
			`readme = processMarkdown(readme)`
			`}`
			`if i := sentenceEndIndex(readme); i > 0 {`
			`readmeFirst, readmeRest = readme[:i+1], readme[i+1:]`
			`} else {`
			`readmeRest = readme`
			`}`
			`sw := processWords(synopsis)`
			`rwf := processWords(readmeFirst)`
			`rwr := processWords(readmeRest)`

			`sectionB, _ := split(sw, maxSecWords)`
			`sectionC, rwfd := split(rwf, maxSecWords)`
			`// section D is the part of the readme that is not in sectionC.`
			`rwd := append(rwfd, rwr...)`
			`// Keep maxSecWords of section D, but not more than maxReadmeFrac.`
			`f := int(maxReadmeFrac * float64(len(rwd)))`
			`nkeep := maxSecWords`
			`if nkeep > f {`
			`nkeep = f`
			`}`
			`sectionD, _ := split(rwd, nkeep)`

			`// If there is no synopsis, use first sentence of the README.`
			`// But do not promote the rest of the README to section C.`
			`if len(sectionB) == 0 {`
			`sectionB = sectionC`
			`sectionC = nil`
			`}`

			`prep := func(ws []string) string {`
			`return makeValidUnicode(strings.Join(ws, " "))`
			`}`

			`return prep(sectionB), prep(sectionC), prep(sectionD)`
			`}`

			`// split splits a slice of strings into two parts. The first has length <= n,`
			`// and the second is the rest of the slice. If n is negative, the first part is nil and`
			`// the second part is the entire slice.`
			`func split(a []string, n int) ([]string, []string) {`
			`if n >= len(a) {`
			`return a, nil`
			`}`
			`return a[:n], a[n:]`
			`}`

			`// sentenceEndIndex returns the index in s of the end of the first sentence, or`
			`// -1 if no end can be found. A sentence ends at a '.', '!' or '?' that is`
internal/postgres: fix misspelling of preceded Change-Id: I16fbe356ee414302babb1e45d8bc683f0fc861c2 Reviewed-on: https://team-review.git.corp.google.com/c/golang/discovery/+/717236 Reviewed-by: Jonathan Amsterdam <jba@google.com> 2020-04-14 23:49:29 +03:00			`// followed by a space (or ends the string), and is not preceded by an`
internal/postgres: change search tokens Change the way that the Postgres ts_vector (list of search tokens) is computed. - Use the path_tokens text configuration when creating the tsvector for the path. - Construct sections B and C of the search document by combining the synopsis part of the README. Parts of this processing are: - Extract only the text of a markdown README, to remove images and other extraneous information. - Add alternatives to certain words in the synopsis and README. For example, add "postgresql" whenever we see "postgres". - Modify the ts_rank call in the code to use a B weight of 1. - Change the call to the database search function so that it invokes the function that has a B weight of 1. These changes will require re-computing the search_documents.tsv_search_tokens column. That should be done after these are deployed. Change-Id: Ib81601326f11efd81c8bc733694a000eccecf12b Reviewed-on: https://team-review.git.corp.google.com/c/golang/discovery/+/705958 Reviewed-by: Julie Qiu <julieqiu@google.com> 2020-03-31 00:18:52 +03:00			`// uppercase letter.`
			`func sentenceEndIndex(s string) int {`
			`var prev1, prev2 rune`

			`end := func() bool {`
			`return !unicode.IsUpper(prev2) && (prev1 == '.' \|\| prev1 == '!' \|\| prev1 == '?')`
			`}`

			`for i, r := range s {`
			`if unicode.IsSpace(r) && end() {`
			`return i - 1`
			`}`
			`prev2 = prev1`
			`prev1 = r`
			`}`
			`if end() {`
			`return len(s) - 1`
			`}`
			`return -1`
			`}`

			`// processWords splits s into words at whitespace, then processes each word.`
			`func processWords(s string) []string {`
			`fields := strings.Fields(strings.ToLower(s))`
			`var words []string`
			`for _, f := range fields {`
			`words = append(words, processWord(f)...)`
			`}`
			`return words`
			`}`

			`// summaryReplacements is used to replace words with other words.`
			`// It is used by processWord, below.`
			`// Example key-value pairs:`
all: gofmt Gofmt to update doc comments to the new formatting. For golang/go#51082. Change-Id: Ia9e71e7ecac75822ff43d6c7e60f512442a5fa50 Reviewed-on: https://go-review.googlesource.com/c/pkgsite/+/399617 Run-TryBot: Russ Cox <rsc@golang.org> TryBot-Result: kokoro <noreply+kokoro@google.com> Auto-Submit: Russ Cox <rsc@golang.org> Reviewed-by: Jonathan Amsterdam <jba@google.com> 2022-04-11 20:12:03 +03:00			`//`
			`// "deleteMe": nil // removes "deleteMe"`
			`// "rand": []string{"random"} // replace "rand" with "random"`
			`// "utf-8": []string{"utf-8", "utf8"} // add "utf8" whenever "utf-8" is seen`
internal/postgres: change search tokens Change the way that the Postgres ts_vector (list of search tokens) is computed. - Use the path_tokens text configuration when creating the tsvector for the path. - Construct sections B and C of the search document by combining the synopsis part of the README. Parts of this processing are: - Extract only the text of a markdown README, to remove images and other extraneous information. - Add alternatives to certain words in the synopsis and README. For example, add "postgresql" whenever we see "postgres". - Modify the ts_rank call in the code to use a B weight of 1. - Change the call to the database search function so that it invokes the function that has a B weight of 1. These changes will require re-computing the search_documents.tsv_search_tokens column. That should be done after these are deployed. Change-Id: Ib81601326f11efd81c8bc733694a000eccecf12b Reviewed-on: https://team-review.git.corp.google.com/c/golang/discovery/+/705958 Reviewed-by: Julie Qiu <julieqiu@google.com> 2020-03-31 00:18:52 +03:00			`var summaryReplacements = map[string][]string{`
internal: gofmt -s Excluded changes that would break tests or be undone by 'go generate'. Change-Id: I111ba19cc6d948d84d2c9ecb3271e17b978816d3 Reviewed-on: https://go-review.googlesource.com/c/pkgsite/+/237903 Reviewed-by: Matt Layher <mdlayher@gmail.com> Reviewed-by: Julie Qiu <julie@golang.org> Run-TryBot: Matt Layher <mdlayher@gmail.com> 2020-06-15 19:36:50 +03:00			`"postgres": {"postgres", "postgresql"},`
			`"postgresql": {"postgres", "postgresql"},`
			`"rand": {"random"},`
			`"mongo": {"mongo", "mongodb"},`
			`"mongodb": {"mongo", "mongodb"},`
			`"redis": {"redis", "redisdb"},`
			`"redisdb": {"redis", "redisdb"},`
			`"logger": {"logger", "log"}, // Postgres stemmer does not handle -er`
			`"parser": {"parser", "parse"},`
			`"utf-8": {"utf-8", "utf8"},`
internal/postgres: change search tokens Change the way that the Postgres ts_vector (list of search tokens) is computed. - Use the path_tokens text configuration when creating the tsvector for the path. - Construct sections B and C of the search document by combining the synopsis part of the README. Parts of this processing are: - Extract only the text of a markdown README, to remove images and other extraneous information. - Add alternatives to certain words in the synopsis and README. For example, add "postgresql" whenever we see "postgres". - Modify the ts_rank call in the code to use a B weight of 1. - Change the call to the database search function so that it invokes the function that has a B weight of 1. These changes will require re-computing the search_documents.tsv_search_tokens column. That should be done after these are deployed. Change-Id: Ib81601326f11efd81c8bc733694a000eccecf12b Reviewed-on: https://team-review.git.corp.google.com/c/golang/discovery/+/705958 Reviewed-by: Julie Qiu <julieqiu@google.com> 2020-03-31 00:18:52 +03:00			`}`

			`// processWord performs processing on s, returning zero or more words.`
			`// Its main purpose is to apply summaryReplacements to replace`
			`// certain words with synonyms or additional search terms.`
			`func processWord(s string) []string {`
			`s = strings.TrimFunc(s, unicode.IsPunct)`
			`if s == "" {`
			`return nil`
			`}`
			`if rs, ok := summaryReplacements[s]; ok {`
			`return rs`
			`}`
			`if !hyphenSplit(s) {`
			`return []string{s}`
			`}`
			`// Apply replacements to parts of hyphenated words.`
			`ws := strings.Split(s, "-")`
			`if len(ws) == 1 {`
			`return ws`
			`}`
			`result := []string{s} // Include the full hyphenated word.`
			`for _, w := range ws {`
			`if rs, ok := summaryReplacements[w]; ok {`
			`result = append(result, rs...)`
			`}`
			`// We don't need to include the parts; the Postgres text-search processor will do that.`
			`}`
			`return result`
			`}`

			`// hyphenSplit reports whether s should be split on hyphens.`
			`func hyphenSplit(s string) bool {`
			`return !(strings.HasPrefix(s, "http://") \|\| strings.HasPrefix(s, "https://"))`
			`}`

			`// isMarkdown reports whether filename says that the file contains markdown.`
			`func isMarkdown(filename string) bool {`
			`ext := strings.ToLower(filepath.Ext(filename))`
			`// https://tools.ietf.org/html/rfc7763 mentions both extensions.`
			`return ext == ".md" \|\| ext == ".markdown"`
			`}`

			`// processMarkdown returns the text of a markdown document.`
			`// It omits all formatting and images.`
			`func processMarkdown(s string) string {`
			`parser := blackfriday.New(blackfriday.WithExtensions(blackfriday.CommonExtensions))`
			`root := parser.Parse([]byte(s))`
			`buf := walkMarkdown(root, nil, 0)`
			`return string(buf)`
			`}`

			`// walkMarkdown traverses a blackfriday parse tree, extracting text.`
			`func walkMarkdown(n *blackfriday.Node, buf []byte, level int) []byte {`
			`if n == nil {`
			`return buf`
			`}`
			`switch n.Type {`
			`case blackfriday.Image:`
			`// Skip images because they usually are irrelevant to the package`
			`// (badges and such).`
			`return buf`
			`case blackfriday.CodeBlock:`
			`// Skip code blocks because they have a wide variety of unrelated symbols.`
			`return buf`
			`case blackfriday.Paragraph, blackfriday.Heading:`
			`if len(buf) > 0 {`
			`buf = append(buf, ' ')`
			`}`
			`default:`
			`buf = append(buf, n.Literal...)`
			`}`
			`for c := n.FirstChild; c != nil; c = c.Next {`
			`buf = walkMarkdown(c, buf, level+1)`
			`}`
			`return buf`
			`}`