2020-03-31 00:18:52 +03:00
|
|
|
// Copyright 2020 The Go Authors. All rights reserved.
|
|
|
|
// Use of this source code is governed by a BSD-style
|
|
|
|
// license that can be found in the LICENSE file.
|
|
|
|
|
|
|
|
package postgres
|
|
|
|
|
|
|
|
import (
|
|
|
|
"path/filepath"
|
|
|
|
"strings"
|
|
|
|
"unicode"
|
|
|
|
|
|
|
|
"github.com/russross/blackfriday/v2"
|
|
|
|
)
|
|
|
|
|
|
|
|
// Default limits used by SearchDocumentSections.
const (
	// maxSectionWords is the largest number of words kept in any one section.
	maxSectionWords = 50

	// maxReadmeFraction bounds section D to this leading fraction of the
	// README words that are not already in section C.
	maxReadmeFraction = 0.5
)
|
|
|
|
|
|
|
|
// SearchDocumentSections computes the B and C sections of a Postgres search
|
|
|
|
// document from a package synopsis and a README.
|
|
|
|
// By "B section" and "C section" we mean the portion of the tsvector with weight
|
|
|
|
// "B" and "C", respectively.
|
|
|
|
//
|
|
|
|
// The B section consists of the synopsis.
|
|
|
|
// The C section consists of the first sentence of the README.
|
|
|
|
// The D section consists of the remainder of the README.
|
|
|
|
// All sections are split into words and processed for replacements.
|
|
|
|
// Each section is limited to maxSectionWords words, and in addition the
|
|
|
|
// D section is limited to an initial fraction of the README, determined
|
|
|
|
// by maxReadmeFraction.
|
|
|
|
func SearchDocumentSections(synopsis, readmeFilename, readme string) (b, c, d string) {
|
|
|
|
return searchDocumentSections(synopsis, readmeFilename, readme, maxSectionWords, maxReadmeFraction)
|
|
|
|
}
|
|
|
|
|
|
|
|
func searchDocumentSections(synopsis, readmeFilename, readme string, maxSecWords int, maxReadmeFrac float64) (b, c, d string) {
|
|
|
|
var readmeFirst, readmeRest string
|
|
|
|
if isMarkdown(readmeFilename) {
|
|
|
|
readme = processMarkdown(readme)
|
|
|
|
}
|
|
|
|
if i := sentenceEndIndex(readme); i > 0 {
|
|
|
|
readmeFirst, readmeRest = readme[:i+1], readme[i+1:]
|
|
|
|
} else {
|
|
|
|
readmeRest = readme
|
|
|
|
}
|
|
|
|
sw := processWords(synopsis)
|
|
|
|
rwf := processWords(readmeFirst)
|
|
|
|
rwr := processWords(readmeRest)
|
|
|
|
|
|
|
|
sectionB, _ := split(sw, maxSecWords)
|
|
|
|
sectionC, rwfd := split(rwf, maxSecWords)
|
|
|
|
// section D is the part of the readme that is not in sectionC.
|
|
|
|
rwd := append(rwfd, rwr...)
|
|
|
|
// Keep maxSecWords of section D, but not more than maxReadmeFrac.
|
|
|
|
f := int(maxReadmeFrac * float64(len(rwd)))
|
|
|
|
nkeep := maxSecWords
|
|
|
|
if nkeep > f {
|
|
|
|
nkeep = f
|
|
|
|
}
|
|
|
|
sectionD, _ := split(rwd, nkeep)
|
|
|
|
|
|
|
|
// If there is no synopsis, use first sentence of the README.
|
|
|
|
// But do not promote the rest of the README to section C.
|
|
|
|
if len(sectionB) == 0 {
|
|
|
|
sectionB = sectionC
|
|
|
|
sectionC = nil
|
|
|
|
}
|
|
|
|
|
|
|
|
prep := func(ws []string) string {
|
|
|
|
return makeValidUnicode(strings.Join(ws, " "))
|
|
|
|
}
|
|
|
|
|
|
|
|
return prep(sectionB), prep(sectionC), prep(sectionD)
|
|
|
|
}
|
|
|
|
|
|
|
|
// split splits a slice of strings into two parts. The first has length <= n,
// and the second is the rest of the slice. If n is negative, the first part is
// nil and the second part is the entire slice. If n is at least len(a), the
// first part is a itself and the second part is nil.
//
// Both results share a's backing array; no elements are copied.
func split(a []string, n int) ([]string, []string) {
	// A negative n cannot be used as a slice bound (a[:n] would panic), so
	// handle it explicitly as documented.
	if n < 0 {
		return nil, a
	}
	if n >= len(a) {
		return a, nil
	}
	return a[:n], a[n:]
}
|
|
|
|
|
|
|
|
// sentenceEndIndex reports the byte index in s at which the first sentence
// ends, or -1 when s contains no sentence end. The end of a sentence is a
// '.', '!' or '?' that is not immediately preceded by an uppercase letter and
// that is either followed by whitespace or is the last rune of s.
func sentenceEndIndex(s string) int {
	// last is the most recently consumed rune and beforeLast the rune consumed
	// just before it; both are zero until enough runes have been seen.
	var beforeLast, last rune

	// terminated reports whether the runes consumed so far finish with
	// sentence-ending punctuation.
	terminated := func() bool {
		switch last {
		case '.', '!', '?':
			return !unicode.IsUpper(beforeLast)
		}
		return false
	}

	for pos, cur := range s {
		if unicode.IsSpace(cur) && terminated() {
			// The punctuation mark is a single byte, so it sits right
			// before the whitespace.
			return pos - 1
		}
		beforeLast, last = last, cur
	}

	if !terminated() {
		return -1
	}
	return len(s) - 1
}
|
|
|
|
|
|
|
|
// processWords splits s into words at whitespace, then processes each word.
|
|
|
|
func processWords(s string) []string {
|
|
|
|
fields := strings.Fields(strings.ToLower(s))
|
|
|
|
var words []string
|
|
|
|
for _, f := range fields {
|
|
|
|
words = append(words, processWord(f)...)
|
|
|
|
}
|
|
|
|
return words
|
|
|
|
}
|
|
|
|
|
|
|
|
// summaryReplacements maps a word to the list of words that should stand in
// for it. processWord, below, consults it for whole words and for the parts of
// hyphenated words.
//
// Example key-value pairs:
//
//	"deleteMe": nil                     // removes "deleteMe"
//	"rand": []string{"random"}          // replace "rand" with "random"
//	"utf-8": []string{"utf-8", "utf8"}  // add "utf8" whenever "utf-8" is seen
var summaryReplacements = map[string][]string{
	// Database names: either spelling yields both spellings.
	"mongo":      []string{"mongo", "mongodb"},
	"mongodb":    []string{"mongo", "mongodb"},
	"postgres":   []string{"postgres", "postgresql"},
	"postgresql": []string{"postgres", "postgresql"},
	"redis":      []string{"redis", "redisdb"},
	"redisdb":    []string{"redis", "redisdb"},

	// Postgres stemmer does not handle -er, so add the stem explicitly.
	"logger": []string{"logger", "log"},
	"parser": []string{"parser", "parse"},

	// Abbreviations and alternate spellings.
	"rand":  []string{"random"},
	"utf-8": []string{"utf-8", "utf8"},
}
|
|
|
|
|
|
|
|
// processWord performs processing on s, returning zero or more words.
|
|
|
|
// Its main purpose is to apply summaryReplacements to replace
|
|
|
|
// certain words with synonyms or additional search terms.
|
|
|
|
func processWord(s string) []string {
|
|
|
|
s = strings.TrimFunc(s, unicode.IsPunct)
|
|
|
|
if s == "" {
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
if rs, ok := summaryReplacements[s]; ok {
|
|
|
|
return rs
|
|
|
|
}
|
|
|
|
if !hyphenSplit(s) {
|
|
|
|
return []string{s}
|
|
|
|
}
|
|
|
|
// Apply replacements to parts of hyphenated words.
|
|
|
|
ws := strings.Split(s, "-")
|
|
|
|
if len(ws) == 1 {
|
|
|
|
return ws
|
|
|
|
}
|
|
|
|
result := []string{s} // Include the full hyphenated word.
|
|
|
|
for _, w := range ws {
|
|
|
|
if rs, ok := summaryReplacements[w]; ok {
|
|
|
|
result = append(result, rs...)
|
|
|
|
}
|
|
|
|
// We don't need to include the parts; the Postgres text-search processor will do that.
|
|
|
|
}
|
|
|
|
return result
|
|
|
|
}
|
|
|
|
|
|
|
|
// hyphenSplit reports whether s should be split on hyphens. Words that begin
// with an http or https URL scheme are not split.
func hyphenSplit(s string) bool {
	for _, scheme := range [...]string{"http://", "https://"} {
		if strings.HasPrefix(s, scheme) {
			return false
		}
	}
	return true
}
|
|
|
|
|
|
|
|
// isMarkdown reports whether filename's extension marks the file as markdown.
// The comparison ignores case.
func isMarkdown(filename string) bool {
	switch strings.ToLower(filepath.Ext(filename)) {
	case ".md", ".markdown":
		// https://tools.ietf.org/html/rfc7763 mentions both extensions.
		return true
	default:
		return false
	}
}
|
|
|
|
|
|
|
|
// processMarkdown returns the text of a markdown document.
|
|
|
|
// It omits all formatting and images.
|
|
|
|
func processMarkdown(s string) string {
|
|
|
|
parser := blackfriday.New(blackfriday.WithExtensions(blackfriday.CommonExtensions))
|
|
|
|
root := parser.Parse([]byte(s))
|
|
|
|
buf := walkMarkdown(root, nil, 0)
|
|
|
|
return string(buf)
|
|
|
|
}
|
|
|
|
|
|
|
|
// walkMarkdown traverses a blackfriday parse tree, extracting text.
|
|
|
|
func walkMarkdown(n *blackfriday.Node, buf []byte, level int) []byte {
|
|
|
|
if n == nil {
|
|
|
|
return buf
|
|
|
|
}
|
|
|
|
switch n.Type {
|
|
|
|
case blackfriday.Image:
|
|
|
|
// Skip images because they usually are irrelevant to the package
|
|
|
|
// (badges and such).
|
|
|
|
return buf
|
|
|
|
case blackfriday.CodeBlock:
|
|
|
|
// Skip code blocks because they have a wide variety of unrelated symbols.
|
|
|
|
return buf
|
|
|
|
case blackfriday.Paragraph, blackfriday.Heading:
|
|
|
|
if len(buf) > 0 {
|
|
|
|
buf = append(buf, ' ')
|
|
|
|
}
|
|
|
|
default:
|
|
|
|
buf = append(buf, n.Literal...)
|
|
|
|
}
|
|
|
|
for c := n.FirstChild; c != nil; c = c.Next {
|
|
|
|
buf = walkMarkdown(c, buf, level+1)
|
|
|
|
}
|
|
|
|
return buf
|
|
|
|
}
|