A package will be removed if either it has no commits in two years and
no imports from other packages, or it is created for a quick bug fix,
which means it has one or two commits within a week of creation time and
no other activity since then.

This CL detects such packages on GitHub and Bitbucket, using their
APIs to gather commit information.

This CL also removes the reference checks for GitHub repositories; instead,
it checks the most recent commit on the default branch.

This fixes #405

Change-Id: I14b0f0133f31851511aaa63eee8acbfba63e13d2
Reviewed-on: https://go-review.googlesource.com/24513
Reviewed-by: Alan Donovan <adonovan@google.com>
This commit is contained in:
Tuo Shan 2016-06-29 17:29:32 -04:00 коммит произвёл Alan Donovan
Родитель f7243c4f72
Коммит 7f31cf313c
11 изменённых файлов: 144 добавлений и 80 удалений

Просмотреть файл

@ -601,6 +601,21 @@ var deleteScript = redis.NewScript(0, `
// Delete removes the package identified by path from the database.
//
// When GAESearch is enabled, the package's id is first looked up in the
// "ids" hash and its entry is removed from the search index. If the hash
// has no id for path, Delete returns nil without running deleteScript.
func (db *Database) Delete(path string) error {
	conn := db.Pool.Get()
	defer conn.Close()

	if GAESearch {
		ctx := bgCtx()
		id, lookupErr := redis.String(conn.Do("HGET", "ids", path))
		switch {
		case lookupErr == redis.ErrNil:
			// No id is recorded for this path; nothing to delete.
			return nil
		case lookupErr != nil:
			return lookupErr
		}
		if indexErr := deleteIndex(ctx, id); indexErr != nil {
			return indexErr
		}
	}

	_, scriptErr := deleteScript.Do(conn, path)
	return scriptErr
}

Просмотреть файл

@ -172,6 +172,14 @@ func isTermSep2(r rune) bool {
unicode.IsSymbol(r)
}
// deleteIndex opens the "packages" search index and deletes the entry
// with the given id from it. It returns the error from opening the index
// or, failing that, the result of the delete call.
func deleteIndex(c context.Context, id string) error {
	packages, openErr := search.Open("packages")
	if openErr != nil {
		return openErr
	}
	return packages.Delete(c, id)
}
// PurgeIndex deletes all the packages from the search index.
func PurgeIndex(c context.Context) error {
idx, err := search.Open("packages")

Просмотреть файл

@ -77,6 +77,7 @@ func TestPutIndexNewPackageAndUpdate(t *testing.T) {
ImportCount: 1,
Fork: true,
Stars: 10,
Score: 0.99,
}
if got != wanted {
t.Errorf("PutIndex got %v, want %v", got, wanted)

Просмотреть файл

@ -61,7 +61,7 @@ func crawlDoc(source string, importPath string, pdoc *doc.Package, hasSubdirs bo
}
pdoc = nil
err = gosrc.NotFoundError{Message: "no Go files or subdirs"}
} else if err != gosrc.ErrNotModified {
} else if _, ok := err.(gosrc.NotModifiedError); !ok {
pdoc = pdocNew
}
}
@ -75,27 +75,54 @@ func crawlDoc(source string, importPath string, pdoc *doc.Package, hasSubdirs bo
nextCrawl = start.Add(*maxAge * 30)
}
switch {
case err == nil:
if err == nil {
message = append(message, "put:", pdoc.Etag)
if err := db.Put(pdoc, nextCrawl, false); err != nil {
log.Printf("ERROR db.Put(%q): %v", importPath, err)
}
return pdoc, nil
case err == gosrc.ErrNotModified:
} else if e, ok := err.(gosrc.NotModifiedError); ok {
if !pdoc.IsCmd && isInactivePkg(importPath, e.Since) {
message = append(message, "delete inactive")
if err := db.Delete(importPath); err != nil {
log.Printf("ERROR db.Delete(%q): %v", importPath, err)
}
return nil, e
}
// Touch the package without updating and move on to next one.
message = append(message, "touch")
if err := db.SetNextCrawlEtag(pdoc.ProjectRoot, pdoc.Etag, nextCrawl); err != nil {
log.Printf("ERROR db.SetNextCrawlEtag(%q): %v", importPath, err)
}
return pdoc, nil
case gosrc.IsNotFound(err):
message = append(message, "notfound:", err)
} else if err == gosrc.ErrQuickFork {
message = append(message, "delete", err)
if err := db.Delete(importPath); err != nil {
log.Printf("ERROR db.Delete(%q): %v", importPath, err)
}
return nil, err
default:
} else if e, ok := err.(gosrc.NotFoundError); ok {
message = append(message, "notfound:", e)
if err := db.Delete(importPath); err != nil {
log.Printf("ERROR db.Delete(%q): %v", importPath, err)
}
return nil, e
} else {
message = append(message, "ERROR:", err)
return nil, err
}
}
// isInactivePkg reports whether the specified package is not imported
// and has not been modified in 2 years.
//
// It returns false when lastCommitted is the zero time (the last commit
// time is unknown), when the last commit is less than two years old, or
// when the importer count cannot be determined. Callers delete packages
// reported as inactive, so a failed lookup must never be read as
// "no importers".
func isInactivePkg(pkg string, lastCommitted time.Time) bool {
	if lastCommitted.IsZero() ||
		time.Now().Before(lastCommitted.Add(2*365*24*time.Hour)) {
		return false
	}
	n, err := db.ImporterCount(pkg)
	if err != nil {
		log.Printf("ERROR db.ImporterCount(%q): %v", pkg, err)
		// Be conservative: without a reliable count we cannot claim the
		// package is unused.
		return false
	}
	return n == 0
}

Просмотреть файл

@ -7,6 +7,7 @@
package gosrc
import (
"log"
"net/http"
"path"
"regexp"
@ -34,6 +35,11 @@ type bitbucketRepo struct {
IsFork bool `json:"is_fork"`
}
// bitbucketNode is the JSON shape decoded for each entry of the Bitbucket
// 1.0 API "branches" and "tags" listings of a repository.
type bitbucketNode struct {
// Node is recorded as the commit identifier for the branch or tag name.
Node string `json:"node"`
// Timestamp is the raw "utctimestamp" value; getBitbucketDir parses it
// with the layout "2006-01-02 15:04:05Z07:00".
Timestamp string `json:"utctimestamp"`
}
func getBitbucketDir(client *http.Client, match map[string]string, savedEtag string) (*Directory, error) {
var repo *bitbucketRepo
c := &httpClient{client: client}
@ -50,27 +56,35 @@ func getBitbucketDir(client *http.Client, match map[string]string, savedEtag str
}
tags := make(map[string]string)
timestamps := make(map[string]time.Time)
for _, nodeType := range []string{"branches", "tags"} {
var nodes map[string]struct {
Node string
}
var nodes map[string]bitbucketNode
if _, err := c.getJSON(expand("https://api.bitbucket.org/1.0/repositories/{owner}/{repo}/{0}", match, nodeType), &nodes); err != nil {
return nil, err
}
for t, n := range nodes {
tags[t] = n.Node
const timeFormat = "2006-01-02 15:04:05Z07:00"
committed, err := time.Parse(timeFormat, n.Timestamp)
if err != nil {
log.Println("error parsing timestamp:", n.Timestamp)
continue
}
timestamps[t] = committed
}
}
var err error
match["tag"], match["commit"], err = bestTag(tags, defaultTags[match["vcs"]])
tag, commit, err := bestTag(tags, defaultTags[match["vcs"]])
if err != nil {
return nil, err
}
match["tag"] = tag
match["commit"] = commit
etag := expand("{vcs}-{commit}", match)
if etag == savedEtag {
return nil, ErrNotModified
return nil, NotModifiedError{Since: timestamps[tag]}
}
if repo == nil {

Просмотреть файл

@ -38,6 +38,15 @@ var (
ownerRepoPat = regexp.MustCompile(`^https://api.github.com/repos/([^/]+)/([^/]+)/`)
)
// githubCommit is the subset of an entry in the GitHub
// /repos/{owner}/{repo}/commits response that this package decodes.
type githubCommit struct {
// ID is the commit SHA; getGitHubDir compares it with the saved etag.
ID string `json:"sha"`
Commit struct {
Committer struct {
// Date is the committer date of the commit.
Date time.Time `json:"date"`
} `json:"committer"`
} `json:"commit"`
}
func gitHubError(resp *http.Response) error {
var e struct {
Message string `json:"message"`
@ -52,53 +61,34 @@ func getGitHubDir(client *http.Client, match map[string]string, savedEtag string
c := &httpClient{client: client, errFn: gitHubError}
type refJSON struct {
Object struct {
Type string
Sha string
URL string
}
Ref string
URL string
var repo struct {
Fork bool `json:"fork"`
Stars int `json:"stargazers_count"`
CreatedAt time.Time `json:"created_at"`
PushedAt time.Time `json:"pushed_at"`
}
var refs []*refJSON
resp, err := c.getJSON(expand("https://api.github.com/repos/{owner}/{repo}/git/refs", match), &refs)
if err != nil {
if _, err := c.getJSON(expand("https://api.github.com/repos/{owner}/{repo}", match), &repo); err != nil {
return nil, err
}
// If the response contains a Link header, then fallback to requesting "master" and "go1" by name.
if resp.Header.Get("Link") != "" {
var masterRef refJSON
if _, err := c.getJSON(expand("https://api.github.com/repos/{owner}/{repo}/git/refs/heads/master", match), &masterRef); err == nil {
refs = append(refs, &masterRef)
}
var go1Ref refJSON
if _, err := c.getJSON(expand("https://api.github.com/repos/{owner}/{repo}/git/refs/tags/go1", match), &go1Ref); err == nil {
refs = append(refs, &go1Ref)
}
var commits []*githubCommit
url := expand("https://api.github.com/repos/{owner}/{repo}/commits", match)
url += fmt.Sprintf("?since=%s", repo.CreatedAt.Format(time.RFC3339))
if match["dir"] != "" {
url += fmt.Sprintf("&path=%s", match["dir"])
}
tags := make(map[string]string)
for _, ref := range refs {
switch {
case strings.HasPrefix(ref.Ref, "refs/heads/"):
tags[ref.Ref[len("refs/heads/"):]] = ref.Object.Sha
case strings.HasPrefix(ref.Ref, "refs/tags/"):
tags[ref.Ref[len("refs/tags/"):]] = ref.Object.Sha
}
}
var commit string
match["tag"], commit, err = bestTag(tags, "master")
if err != nil {
if _, err := c.getJSON(url, &commits); err != nil {
return nil, err
}
if commit == savedEtag {
return nil, ErrNotModified
if repo.Fork && isQuickFork(commits, repo.CreatedAt) {
return nil, ErrQuickFork
}
if len(commits) == 0 {
return nil, NotFoundError{Message: "package directory changed or removed"}
}
if commits[0].ID == savedEtag {
return nil, NotModifiedError{Since: commits[0].Commit.Committer.Date}
}
var contents []*struct {
@ -108,7 +98,7 @@ func getGitHubDir(client *http.Client, match map[string]string, savedEtag string
HTMLURL string `json:"html_url"`
}
if _, err := c.getJSON(expand("https://api.github.com/repos/{owner}/{repo}/contents{dir}?ref={tag}", match), &contents); err != nil {
if _, err := c.getJSON(expand("https://api.github.com/repos/{owner}/{repo}/contents{dir}", match), &contents); err != nil {
// The GitHub content API returns array values for directories
// and object values for files. If there's a type mismatch at
// the beginning of the response, then assume that the path is
@ -157,25 +147,14 @@ func getGitHubDir(client *http.Client, match map[string]string, savedEtag string
browseURL := expand("https://github.com/{owner}/{repo}", match)
if match["dir"] != "" {
browseURL = expand("https://github.com/{owner}/{repo}/tree/{tag}{dir}", match)
}
var repo = struct {
Fork bool `json:"fork"`
Stars int `json:"stargazers_count"`
CreatedAt time.Time `json:"created_at"`
PushedAt time.Time `json:"pushed_at"`
}{}
if _, err := c.getJSON(expand("https://api.github.com/repos/{owner}/{repo}", match), &repo); err != nil {
return nil, err
browseURL = expand("https://github.com/{owner}/{repo}/tree{dir}", match)
}
isDeadEndFork := repo.Fork && repo.PushedAt.Before(repo.CreatedAt)
return &Directory{
BrowseURL: browseURL,
Etag: commit,
Etag: commits[0].ID,
Files: files,
LineFmt: "%s#L%d",
ProjectName: match["repo"],
@ -189,6 +168,24 @@ func getGitHubDir(client *http.Client, match map[string]string, savedEtag string
}, nil
}
// isQuickFork reports whether commits describe a "quick fork": at most
// two commits, none of them dated later than one week after createdAt.
// A repository that is itself less than a week old is never reported as
// a quick fork, since it may still be under active development.
func isQuickFork(commits []*githubCommit, createdAt time.Time) bool {
	const gracePeriod = 7 * 24 * time.Hour
	cutoff := createdAt.Add(gracePeriod)

	switch {
	case len(commits) >= 3:
		return false
	case time.Now().Before(cutoff):
		// The repository has not existed for a full week yet.
		return false
	}

	for i := range commits {
		if commits[i].Commit.Committer.Date.After(cutoff) {
			return false
		}
	}
	return true
}
func getGitHubPresentation(client *http.Client, match map[string]string) (*Presentation, error) {
c := &httpClient{client: client, header: gitHubRawHeader}
@ -310,7 +307,7 @@ func getGistDir(client *http.Client, match map[string]string, savedEtag string)
commit := gist.History[0].Version
if commit == savedEtag {
return nil, ErrNotModified
return nil, NotModifiedError{}
}
var files []*File

Просмотреть файл

@ -23,12 +23,6 @@ func getStandardDir(client *http.Client, importPath string, savedEtag string) (*
browseURL := "https://golang.org/src/" + importPath + "/"
p, err := c.getBytes(browseURL)
if IsNotFound(err) {
// Fallback to Go 1.3 directory structure.
// TODO(garyburd): Delete fallback after 1.4 is pushed to golang.org.
browseURL = "https://golang.org/src/pkg/" + importPath + "/"
p, err = c.getBytes(browseURL)
}
if err != nil {
return nil, err
}
@ -40,7 +34,7 @@ func getStandardDir(client *http.Client, importPath string, savedEtag string) (*
}
etag = strings.Trim(string(m[1]), ". ")
if etag == savedEtag {
return nil, ErrNotModified
return nil, NotModifiedError{}
}
var files []*File

Просмотреть файл

@ -75,7 +75,7 @@ func getGoogleDir(client *http.Client, match map[string]string, savedEtag string
}
etag = expand("{vcs}-{0}", match, string(m[1]))
if etag == savedEtag {
return nil, ErrNotModified
return nil, NotModifiedError{}
}
var subdirs []string

Просмотреть файл

@ -16,6 +16,7 @@ import (
"path"
"regexp"
"strings"
"time"
)
// File represents a file.
@ -112,8 +113,15 @@ func (e *RemoteError) Error() string {
return e.err.Error()
}
// ErrNotModified indicates that the directory matches the specified etag.
var ErrNotModified = errors.New("package not modified")
// NotModifiedError indicates that the directory matches the specified etag.
//
// Since, when non-zero, is the time of the most recent commit known to the
// fetcher that produced the error. Fetchers that cannot determine it leave
// Since as the zero time.
type NotModifiedError struct {
	Since time.Time
}

// Error implements the error interface. The timestamp is included only
// when Since is set, so a zero Since does not render as a bogus year-1 date.
func (e NotModifiedError) Error() string {
	if e.Since.IsZero() {
		return "package not modified"
	}
	return fmt.Sprintf("package not modified since %s", e.Since.Format(time.RFC1123))
}
// ErrQuickFork is returned by getGitHubDir when the repository is a fork
// that isQuickFork classifies as a quick bug-fix fork.
var ErrQuickFork = errors.New("package is a quick bug-fix fork")
var errNoMatch = errors.New("no match")

Просмотреть файл

@ -119,7 +119,7 @@ func getLaunchpadDir(client *http.Client, match map[string]string, savedEtag str
hash = m.Sum(hash[:0])
etag := hex.EncodeToString(hash)
if etag == savedEtag {
return nil, ErrNotModified
return nil, NotModifiedError{}
}
return &Directory{

Просмотреть файл

@ -148,7 +148,7 @@ func downloadGit(schemes []string, clonePath, repo, savedEtag string) (string, s
etag := scheme + "-" + commit
if etag == savedEtag {
return "", "", ErrNotModified
return "", "", NotModifiedError{}
}
dir := filepath.Join(TempDir, repo+".git")
@ -201,7 +201,7 @@ func downloadSVN(schemes []string, clonePath, repo, savedEtag string) (string, s
etag := scheme + "-" + revno
if etag == savedEtag {
return "", "", ErrNotModified
return "", "", NotModifiedError{}
}
dir := filepath.Join(TempDir, repo+".svn")