pkgsite/internal/frontend/completion.go

// Copyright 2019 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package frontend

import (
	"bytes"
	"context"
	"encoding/json"
	"io"
	"net/http"
	"sort"
	"strings"

	"github.com/go-redis/redis/v7"
	"golang.org/x/pkgsite/internal/complete"
	"golang.org/x/pkgsite/internal/derrors"
	"golang.org/x/pkgsite/internal/log"
)

// handleAutoCompletion handles requests for /autocomplete?q=<input prefix>, by
// querying redis sorted sets indexing package paths.
//
// The query is lower-cased before lookup and at most 5 completions are
// requested. On success the response body is a JSON array (never null). If the
// completion query or the JSON encoding fails, the error is logged and a 500
// is returned instead of a partial or empty body.
func (s *Server) handleAutoCompletion(w http.ResponseWriter, r *http.Request) {
	ctx := r.Context()
	var completions []*complete.Completion
	// Without a completion client there is nothing to query; fall through and
	// serve an empty array.
	if s.cmplClient != nil {
		var err error
		q := r.FormValue("q")
		completions, err = doCompletion(ctx, s.cmplClient, strings.ToLower(q), 5)
		if err != nil {
			// Log before replying so that the 500 can be traced to its cause.
			log.Errorf(ctx, "handleAutoCompletion: %v", err)
			code := http.StatusInternalServerError
			http.Error(w, http.StatusText(code), code)
			return
		}
	}
	if completions == nil {
		// autocomplete.js complains if the JSON returned by this endpoint is null,
		// so we initialize a non-nil empty array to serialize to an empty JSON
		// array.
		completions = []*complete.Completion{}
	}
	response, err := json.Marshal(completions)
	if err != nil {
		log.Errorf(ctx, "error marshalling completion: json.Marshal: %v", err)
		// Do not send a 200 with an empty body labelled as JSON: the client
		// would fail to parse it.
		code := http.StatusInternalServerError
		http.Error(w, http.StatusText(code), code)
		return
	}
	w.Header().Set("Content-Type", "application/json")
	if _, err := io.Copy(w, bytes.NewReader(response)); err != nil {
		log.Errorf(ctx, "Error copying json buffer to ResponseWriter: %v", err)
	}
}

// scoredCompletion wraps Completions with a relevancy score, so that they can
// be sorted.
type scoredCompletion struct {
	// c is the decoded completion being ranked.
	c     *complete.Completion
	// score is the relevancy score; results are sorted by it in descending
	// order, so a higher score ranks earlier.
	score int
}

// doCompletion runs the completion query for q against redis and returns at
// most maxResults completions, best score first. The approach is based on
// http://oldblog.antirez.com/post/autocomplete-with-redis.html, with two
// changes:
//  + ZRANGEBYLEX is used so that not every possible prefix has to be stored;
//    that command was added to Redis after the blog post was written.
//  + A second sorted set holding popular packages is consulted first, to
//    improve the relevancy of completions.
//
// The steps are:
//  1. Look up completions starting with q in the popular index, scored and
//     trimmed to maxResults by completeWithIndex.
//  2. If that already yields maxResults results, stop.
//  3. Otherwise look up the remaining (unpopular) index for the number of
//     results still missing, append them, and sort the combined list again,
//     since an unpopular package may score higher than a popular one.
func doCompletion(ctx context.Context, r *redis.Client, q string, maxResults int) (_ []*complete.Completion, err error) {
	defer derrors.Wrap(&err, "doCompletion(%q, %d)", q, maxResults)

	ranked, err := completeWithIndex(ctx, r, q, complete.PopularKey, maxResults)
	if err != nil {
		return nil, err
	}
	if missing := maxResults - len(ranked); missing > 0 {
		rest, err := completeWithIndex(ctx, r, q, complete.RemainingKey, missing)
		if err != nil {
			return nil, err
		}
		ranked = append(ranked, rest...)
		// The suffix-length weighting can give an unpopular completion a higher
		// score than a popular one, so the merged list needs a fresh sort.
		sort.Slice(ranked, func(a, b int) bool {
			return ranked[b].score < ranked[a].score
		})
	}
	if len(ranked) == 0 {
		// No matches: hand back a nil slice.
		return nil, nil
	}
	results := make([]*complete.Completion, len(ranked))
	for i := range ranked {
		results[i] = ranked[i].c
	}
	return results, nil
}

// completeWithIndex queries the redis sorted set indexKey for entries that
// start with q, scores each decoded entry, and returns at most maxResults of
// them sorted by score in descending order.
//
// It returns an error if the redis query fails or if any returned entry
// cannot be decoded.
func completeWithIndex(ctx context.Context, r *redis.Client, q, indexKey string, maxResults int) (_ []*scoredCompletion, err error) {
	defer derrors.Wrap(&err, "completeWithIndex(%q, %q, %d)", q, indexKey, maxResults)

	// Query for possible completions using ZRANGEBYLEX. See documentation at
	// https://redis.io/commands/zrangebylex
	// Notably, the "(" character in the Min and Max fields means 'exclude this
	// endpoint'.
	// We bound our search in two ways: (1) by setting Max to the smallest string
	// that is lexically greater than q but does not start with q, and (2) by
	// setting an arbitrary limit of 1000 results.
	entries, err := r.WithContext(ctx).ZRangeByLex(indexKey, &redis.ZRangeBy{
		Min:   "(" + q,
		Max:   "(" + nextPrefix(q),
		Count: 1000,
	}).Result()
	if err != nil {
		// A failed query must be reported to the caller rather than being
		// treated as "no completions".
		return nil, err
	}
	var scored []*scoredCompletion
	for _, entry := range entries {
		c, err := complete.Decode(entry)
		if err != nil {
			return nil, err
		}
		// Number of "/"-separated segments in the entry; always at least 1.
		offset := strings.Count(entry, "/") + 1
		s := &scoredCompletion{
			c: c,
			// Weight importers by distance of the matching text from the end of the
			// import path. This is done in an attempt to make results more relevant
			// the closer the match is to the end of the import path. For example, if
			// the user types 'net', we should have some preference for 'net' over
			// 'net/http'. In this case, it actually works out like so:
			//  - net has ~68000 importers
			//  - net/http has ~130000 importers
			//
			// So the score of 'net' is ~68000 (offset=1), and the score of
			// 'net/http' is ~65000 (130K/2, as offset=2), therefore net should be
			// sorted above 'net/http' in the results.
			//
			// This heuristic is a total guess, but since this is just autocomplete
			// it probably doesn't matter much. In testing, it felt like autocomplete
			// was completing the packages I wanted.
			//
			// The `- offset` term is added to break ties in the case where all
			// completion results have 0 importers.
			score: c.Importers/offset - offset,
		}
		scored = append(scored, s)
	}
	// sort by score descending
	sort.Slice(scored, func(i, j int) bool {
		return scored[i].score > scored[j].score
	})
	if len(scored) > maxResults {
		scored = scored[:maxResults]
	}
	return scored, nil
}

// nextPrefix computes the lexically smallest string that sorts after prefix
// without having prefix as its own prefix. It returns "" when no such string
// exists.
func nextPrefix(prefix string) string {
	// redis strings are ASCII, and '~' is the largest printing ASCII byte (it
	// is also a valid URL character). Walk backwards from the end of prefix to
	// the last byte that can still be incremented: every trailing byte that is
	// '~' or above is dropped, because nothing greater can take its place.
	for end := len(prefix); end > 0; end-- {
		if last := prefix[end-1]; last < '~' {
			// Bump this byte and discard everything after it.
			return prefix[:end-1] + string(last+1)
		}
	}
	// Either prefix is empty or it consists solely of twiddles (e.g. "~~~").
	// In the latter case every string greater than prefix also starts with
	// it, so there is no valid answer and "" is returned.
	return ""
}
internal/frontend: add an autocompletion endpoint Now that completion data is being populated by the ETL, we can use it to provide a JSON API for search autocompletions at /autocomplete?q=query. This API reads lexical completions from Redis, sorts according to a scoring heuristic, and returns the results as a JSON array. For testing purposes, the pathCompletions function is moved from the etl package to the complete package, and exported. Updates b/143370178 Change-Id: I61d75296399222fa9ebfe8aadc909bf048389a23 Reviewed-on: https://team-review.git.corp.google.com/c/golang/discovery/+/602064 Reviewed-by: Jonathan Amsterdam <jba@google.com> 2019-11-19 21:53:35 +03:00			`// Copyright 2019 The Go Authors. All rights reserved.`
			`// Use of this source code is governed by a BSD-style`
			`// license that can be found in the LICENSE file.`

			`package frontend`

			`import (`
			`"bytes"`
			`"context"`
			`"encoding/json"`
			`"io"`
			`"net/http"`
			`"sort"`
			`"strings"`

			`"github.com/go-redis/redis/v7"`
all: rename module to golang.org/x/pkgsite golang.org/x/discovery is renamed to golang.org/x/pkgsite. When the repository is open sourced, it will be hosted at go.googlesource.com/pkgsite. Change-Id: Ifc3b45b771a385b99179e785447f2a87afcacf87 Reviewed-on: https://team-review.git.corp.google.com/c/golang/discovery/+/724273 Reviewed-by: Jonathan Amsterdam <jba@google.com> 2020-04-21 23:51:29 +03:00			`"golang.org/x/pkgsite/internal/complete"`
			`"golang.org/x/pkgsite/internal/derrors"`
			`"golang.org/x/pkgsite/internal/log"`
internal/frontend: add an autocompletion endpoint Now that completion data is being populated by the ETL, we can use it to provide a JSON API for search autocompletions at /autocomplete?q=query. This API reads lexical completions from Redis, sorts according to a scoring heuristic, and returns the results as a JSON array. For testing purposes, the pathCompletions function is moved from the etl package to the complete package, and exported. Updates b/143370178 Change-Id: I61d75296399222fa9ebfe8aadc909bf048389a23 Reviewed-on: https://team-review.git.corp.google.com/c/golang/discovery/+/602064 Reviewed-by: Jonathan Amsterdam <jba@google.com> 2019-11-19 21:53:35 +03:00			`)`

			`// handleAutoCompletion handles requests for /autocomplete?q=<input prefix>, by`
			`// querying redis sorted sets indexing package paths.`
			`func (s Server) handleAutoCompletion(w http.ResponseWriter, r http.Request) {`
internal/{log,middleware}: add trace ID to context, use in logging - The requestlog middleware gets the trace ID from the request and adds it to the request's context. - The internal/log package retrieves the traceID and adds it to log messages. It also creates two logs instead of one, a "parent" log used in requestlog, and a "child" log used for all other logging. Together, these two changes will cause the Stackdriver log viewer to group all log messages for a request with the request start and end log messages. - These changes require that all log functions take a context, so I plumbed one through everywhere. In a handful of cases it didn't seem worth doing the plumbing. I used context.TODO() for those so we can easily find and re-evalaute them. Change-Id: I663588463520187d0549a8f802ba9cb44a893592 Reviewed-on: https://team-review.git.corp.google.com/c/golang/discovery/+/622940 Reviewed-by: Robert Findley <rfindley@google.com> 2019-12-18 16:38:16 +03:00			`ctx := r.Context()`
internal/frontend: add an autocompletion endpoint Now that completion data is being populated by the ETL, we can use it to provide a JSON API for search autocompletions at /autocomplete?q=query. This API reads lexical completions from Redis, sorts according to a scoring heuristic, and returns the results as a JSON array. For testing purposes, the pathCompletions function is moved from the etl package to the complete package, and exported. Updates b/143370178 Change-Id: I61d75296399222fa9ebfe8aadc909bf048389a23 Reviewed-on: https://team-review.git.corp.google.com/c/golang/discovery/+/602064 Reviewed-by: Jonathan Amsterdam <jba@google.com> 2019-11-19 21:53:35 +03:00			`var completions []*complete.Completion`
			`if s.cmplClient != nil {`
			`var err error`
			`q := r.FormValue("q")`
			`completions, err = doCompletion(r.Context(), s.cmplClient, strings.ToLower(q), 5)`
			`if err != nil {`
			`code := http.StatusInternalServerError`
			`http.Error(w, http.StatusText(code), code)`
internal/frontend: always return a JSON array for autocomplete null was being returned when completion results are empty, which causes autocomplete.js to generate errors. In order to test this, hook up an HA redis to the integration test and populate auto-completions. Updates b/143370178 Change-Id: Ice30c7a6f9f3d4907746103193947bbd5dc5a45b Reviewed-on: https://team-review.git.corp.google.com/c/golang/discovery/+/607126 CI-Result: Cloud Build <devtools-proctor-result-processor@system.gserviceaccount.com> Reviewed-by: Jonathan Amsterdam <jba@google.com> 2019-11-26 19:05:33 +03:00			`return`
internal/frontend: add an autocompletion endpoint Now that completion data is being populated by the ETL, we can use it to provide a JSON API for search autocompletions at /autocomplete?q=query. This API reads lexical completions from Redis, sorts according to a scoring heuristic, and returns the results as a JSON array. For testing purposes, the pathCompletions function is moved from the etl package to the complete package, and exported. Updates b/143370178 Change-Id: I61d75296399222fa9ebfe8aadc909bf048389a23 Reviewed-on: https://team-review.git.corp.google.com/c/golang/discovery/+/602064 Reviewed-by: Jonathan Amsterdam <jba@google.com> 2019-11-19 21:53:35 +03:00			`}`
internal/frontend: always return a JSON array for autocomplete null was being returned when completion results are empty, which causes autocomplete.js to generate errors. In order to test this, hook up an HA redis to the integration test and populate auto-completions. Updates b/143370178 Change-Id: Ice30c7a6f9f3d4907746103193947bbd5dc5a45b Reviewed-on: https://team-review.git.corp.google.com/c/golang/discovery/+/607126 CI-Result: Cloud Build <devtools-proctor-result-processor@system.gserviceaccount.com> Reviewed-by: Jonathan Amsterdam <jba@google.com> 2019-11-26 19:05:33 +03:00			`}`
			`if completions == nil {`
internal/frontend: add an autocompletion endpoint Now that completion data is being populated by the ETL, we can use it to provide a JSON API for search autocompletions at /autocomplete?q=query. This API reads lexical completions from Redis, sorts according to a scoring heuristic, and returns the results as a JSON array. For testing purposes, the pathCompletions function is moved from the etl package to the complete package, and exported. Updates b/143370178 Change-Id: I61d75296399222fa9ebfe8aadc909bf048389a23 Reviewed-on: https://team-review.git.corp.google.com/c/golang/discovery/+/602064 Reviewed-by: Jonathan Amsterdam <jba@google.com> 2019-11-19 21:53:35 +03:00			`// autocomplete.js complains if the JSON returned by this endpoint is null,`
			`// so we initialize a non-nil empty array to serialize to an empty JSON`
			`// array.`
			`completions = []*complete.Completion{}`
			`}`
			`response, err := json.Marshal(completions)`
			`if err != nil {`
internal/{log,middleware}: add trace ID to context, use in logging - The requestlog middleware gets the trace ID from the request and adds it to the request's context. - The internal/log package retrieves the traceID and adds it to log messages. It also creates two logs instead of one, a "parent" log used in requestlog, and a "child" log used for all other logging. Together, these two changes will cause the Stackdriver log viewer to group all log messages for a request with the request start and end log messages. - These changes require that all log functions take a context, so I plumbed one through everywhere. In a handful of cases it didn't seem worth doing the plumbing. I used context.TODO() for those so we can easily find and re-evalaute them. Change-Id: I663588463520187d0549a8f802ba9cb44a893592 Reviewed-on: https://team-review.git.corp.google.com/c/golang/discovery/+/622940 Reviewed-by: Robert Findley <rfindley@google.com> 2019-12-18 16:38:16 +03:00			`log.Errorf(ctx, "error marshalling completion: json.Marshal: %v", err)`
internal/frontend: add an autocompletion endpoint Now that completion data is being populated by the ETL, we can use it to provide a JSON API for search autocompletions at /autocomplete?q=query. This API reads lexical completions from Redis, sorts according to a scoring heuristic, and returns the results as a JSON array. For testing purposes, the pathCompletions function is moved from the etl package to the complete package, and exported. Updates b/143370178 Change-Id: I61d75296399222fa9ebfe8aadc909bf048389a23 Reviewed-on: https://team-review.git.corp.google.com/c/golang/discovery/+/602064 Reviewed-by: Jonathan Amsterdam <jba@google.com> 2019-11-19 21:53:35 +03:00			`}`
			`w.Header().Set("Content-Type", "application/json")`
			`if _, err := io.Copy(w, bytes.NewReader(response)); err != nil {`
internal/{log,middleware}: add trace ID to context, use in logging - The requestlog middleware gets the trace ID from the request and adds it to the request's context. - The internal/log package retrieves the traceID and adds it to log messages. It also creates two logs instead of one, a "parent" log used in requestlog, and a "child" log used for all other logging. Together, these two changes will cause the Stackdriver log viewer to group all log messages for a request with the request start and end log messages. - These changes require that all log functions take a context, so I plumbed one through everywhere. In a handful of cases it didn't seem worth doing the plumbing. I used context.TODO() for those so we can easily find and re-evalaute them. Change-Id: I663588463520187d0549a8f802ba9cb44a893592 Reviewed-on: https://team-review.git.corp.google.com/c/golang/discovery/+/622940 Reviewed-by: Robert Findley <rfindley@google.com> 2019-12-18 16:38:16 +03:00			`log.Errorf(ctx, "Error copying json buffer to ResponseWriter: %v", err)`
internal/frontend: add an autocompletion endpoint Now that completion data is being populated by the ETL, we can use it to provide a JSON API for search autocompletions at /autocomplete?q=query. This API reads lexical completions from Redis, sorts according to a scoring heuristic, and returns the results as a JSON array. For testing purposes, the pathCompletions function is moved from the etl package to the complete package, and exported. Updates b/143370178 Change-Id: I61d75296399222fa9ebfe8aadc909bf048389a23 Reviewed-on: https://team-review.git.corp.google.com/c/golang/discovery/+/602064 Reviewed-by: Jonathan Amsterdam <jba@google.com> 2019-11-19 21:53:35 +03:00			`}`
			`}`

			`// scoredCompletion wraps Completions with a relevancy score, so that they can`
			`// be sorted.`
			`type scoredCompletion struct {`
			`c *complete.Completion`
			`score int`
			`}`

			`// doCompletion executes the completion query against redis. This is inspired`
			`// by http://oldblog.antirez.com/post/autocomplete-with-redis.html, but`
			`// improved as follows:`
			`// + Use ZRANGEBYLEX to avoid storing each possible prefix, since that was`
			`// added to Redis since the original blog post.`
			`// + Use an additional sorted set that holds popular packages, to improve`
			`// completion relevancy.`
			`//`
			`// We autocomplete the query 'q' as follows`
			`// 1. Query for popular completions starting with q using ZRANGEBYLEX (more`
			`// details on this below). We fetch an arbitrary number of results (1000)`
			`// to bound the amount of work done by redis.`
			`// 2. Sort the returned completions by our score (a mix of popularity and`
			`// proximity to the end of the import path), and filter to the top`
			`// maxResults.`
			`// 3. If we have maxResults results, we're done. Otherwise do (1) on the index`
			`// of remaining (unpopular) package paths, add to our result set, and sort`
			`// again (because unpopular packages might actually score higher than`
			`// popular packages).`
			`func doCompletion(ctx context.Context, r redis.Client, q string, maxResults int) (_ []complete.Completion, err error) {`
			`defer derrors.Wrap(&err, "doCompletion(%q, %d)", q, maxResults)`
			`scored, err := completeWithIndex(ctx, r, q, complete.PopularKey, maxResults)`
			`if err != nil {`
			`return nil, err`
			`}`
			`if len(scored) < maxResults {`
			`unpopular, err := completeWithIndex(ctx, r, q, complete.RemainingKey, maxResults-len(scored))`
			`if err != nil {`
			`return nil, err`
			`}`
			`scored = append(scored, unpopular...)`
			`// Re-sort, as it is possible that an unpopular completion actually has a`
			`// higher score than a popular completion due to the weighting for suffix`
			`// length.`
			`sort.Slice(scored, func(i, j int) bool {`
			`return scored[i].score > scored[j].score`
			`})`
			`}`
			`var completions []*complete.Completion`
			`for _, s := range scored {`
			`completions = append(completions, s.c)`
			`}`
			`return completions, nil`
			`}`

			`func completeWithIndex(ctx context.Context, r redis.Client, q, indexKey string, maxResults int) (_ []scoredCompletion, err error) {`
			`defer derrors.Wrap(&err, "completeWithIndex(%q, %q, %d)", q, indexKey, maxResults)`

			`// Query for possible completions using ZRANGEBYLEX. See documentation at`
			`// https://redis.io/commands/zrangebylex`
			`// Notably, the "(" character in the Min and Max fields means 'exclude this`
			`// endpoint'.`
			`// We bound our search in two ways: (1) by setting Max to the smallest string`
			`// that lexically greater than q but does not start with q, and (2) by`
			`// setting an arbitrary limit of 1000 results.`
			`entries, err := r.WithContext(ctx).ZRangeByLex(indexKey, &redis.ZRangeBy{`
			`Min: "(" + q,`
			`Max: "(" + nextPrefix(q),`
			`Count: 1000,`
			`}).Result()`
			`var scored []*scoredCompletion`
			`for _, entry := range entries {`
			`c, err := complete.Decode(entry)`
			`if err != nil {`
			`return nil, err`
			`}`
			`offset := len(strings.Split(entry, "/"))`
			`s := &scoredCompletion{`
			`c: c,`
			`// Weight importers by distance of the matching text from the end of the`
			`// import path. This is done in an attempt to make results more relevant`
			`// the closer the match is to the end of the import path. For example, if`
			`// the user types 'net', we should have some preference for 'net' over`
			`// 'net/http'. In this case, it actually works out like so:`
			`// - net has ~68000 importers`
			`// - net/http has ~130000 importers`
			`//`
			`// So the score of 'net' is ~68000 (offset=1), and the score of`
			`// 'net/http' is ~65000 (130K/2, as offset=2), therefore net should be`
			`// sorted above 'net/http' in the results.`
			`//`
			`// This heuristic is a total guess, but since this is just autocomplete`
			`// it probably doesn't matter much. In testing, it felt like autocomplete`
			`// was completing the packages I wanted.`
			`//`
			// The `- offset` term is added to break ties in the case where all
			`// completion results have 0 importers.`
			`score: c.Importers/offset - offset,`
			`}`
			`scored = append(scored, s)`
			`}`
			`// sort by score descending`
			`sort.Slice(scored, func(i, j int) bool {`
			`return scored[i].score > scored[j].score`
			`})`
			`if len(scored) > maxResults {`
			`scored = scored[:maxResults]`
			`}`
			`return scored, nil`
			`}`

			`// nextPrefix returns the first string (according to lexical sorting) that is`
			`// greater than prefix but does not start with prefix.`
			`func nextPrefix(prefix string) string {`
			`// redis strings are ASCII. Note that among printing ASCII characters '!' has`
			`// the smallest byte value and '~' has the largest byte value. It also so`
			`// happens that these are both valid characters in a URL.`
			`if prefix == "" {`
			`return ""`
			`}`
			`lastChar := prefix[len(prefix)-1]`
			`if lastChar >= '~' {`
			`// If the last character is '~', there is no greater ascii character so we`
			`// must move to the previous character to find a lexically greater string`
			`// that doesn't start with prefix. Note that in the degenerate case where`
			`// prefix is nothing but twiddles (e.g. "~~~"), we will recurse until we return "",`
			`// which is acceptable: there is no prefix that satisfies our requirements:`
			`// all strings greater than "~~~" must also start with "~~~"`
			`return nextPrefix(prefix[:len(prefix)-1])`
			`}`
			`return prefix[:len(prefix)-1] + string(lastChar+1)`
			`}`