зеркало из https://github.com/golang/gddo.git
gosrc: Remove noise packages.
A package will be removed if either it has had no commits in two years and no imports from other packages, or it was created for a quick bug fix, which means it has one or two commits within a week of its creation time and no other activity since then. This CL checks for such packages on GitHub and Bitbucket, using their APIs to gather commit information. This CL also removes the checks of references for GitHub repos; instead it checks the most recent commit on the default branch. This fixes #405. Change-Id: I14b0f0133f31851511aaa63eee8acbfba63e13d2 Reviewed-on: https://go-review.googlesource.com/24513 Reviewed-by: Alan Donovan <adonovan@google.com>
This commit is contained in:
Родитель
f7243c4f72
Коммит
7f31cf313c
|
@ -601,6 +601,21 @@ var deleteScript = redis.NewScript(0, `
|
|||
func (db *Database) Delete(path string) error {
|
||||
c := db.Pool.Get()
|
||||
defer c.Close()
|
||||
|
||||
if GAESearch {
|
||||
ctx := bgCtx()
|
||||
id, err := redis.String(c.Do("HGET", "ids", path))
|
||||
if err == redis.ErrNil {
|
||||
return nil
|
||||
}
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
if err := deleteIndex(ctx, id); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
|
||||
_, err := deleteScript.Do(c, path)
|
||||
return err
|
||||
}
|
||||
|
|
|
@ -172,6 +172,14 @@ func isTermSep2(r rune) bool {
|
|||
unicode.IsSymbol(r)
|
||||
}
|
||||
|
||||
func deleteIndex(c context.Context, id string) error {
|
||||
idx, err := search.Open("packages")
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
return idx.Delete(c, id)
|
||||
}
|
||||
|
||||
// PurgeIndex deletes all the packages from the search index.
|
||||
func PurgeIndex(c context.Context) error {
|
||||
idx, err := search.Open("packages")
|
||||
|
|
|
@ -77,6 +77,7 @@ func TestPutIndexNewPackageAndUpdate(t *testing.T) {
|
|||
ImportCount: 1,
|
||||
Fork: true,
|
||||
Stars: 10,
|
||||
Score: 0.99,
|
||||
}
|
||||
if got != wanted {
|
||||
t.Errorf("PutIndex got %v, want %v", got, wanted)
|
||||
|
|
|
@ -61,7 +61,7 @@ func crawlDoc(source string, importPath string, pdoc *doc.Package, hasSubdirs bo
|
|||
}
|
||||
pdoc = nil
|
||||
err = gosrc.NotFoundError{Message: "no Go files or subdirs"}
|
||||
} else if err != gosrc.ErrNotModified {
|
||||
} else if _, ok := err.(gosrc.NotModifiedError); !ok {
|
||||
pdoc = pdocNew
|
||||
}
|
||||
}
|
||||
|
@ -75,27 +75,54 @@ func crawlDoc(source string, importPath string, pdoc *doc.Package, hasSubdirs bo
|
|||
nextCrawl = start.Add(*maxAge * 30)
|
||||
}
|
||||
|
||||
switch {
|
||||
case err == nil:
|
||||
if err == nil {
|
||||
message = append(message, "put:", pdoc.Etag)
|
||||
if err := db.Put(pdoc, nextCrawl, false); err != nil {
|
||||
log.Printf("ERROR db.Put(%q): %v", importPath, err)
|
||||
}
|
||||
return pdoc, nil
|
||||
case err == gosrc.ErrNotModified:
|
||||
} else if e, ok := err.(gosrc.NotModifiedError); ok {
|
||||
if !pdoc.IsCmd && isInactivePkg(importPath, e.Since) {
|
||||
message = append(message, "delete inactive")
|
||||
if err := db.Delete(importPath); err != nil {
|
||||
log.Printf("ERROR db.Delete(%q): %v", importPath, err)
|
||||
}
|
||||
return nil, e
|
||||
}
|
||||
// Touch the package without updating and move on to next one.
|
||||
message = append(message, "touch")
|
||||
if err := db.SetNextCrawlEtag(pdoc.ProjectRoot, pdoc.Etag, nextCrawl); err != nil {
|
||||
log.Printf("ERROR db.SetNextCrawlEtag(%q): %v", importPath, err)
|
||||
}
|
||||
return pdoc, nil
|
||||
case gosrc.IsNotFound(err):
|
||||
message = append(message, "notfound:", err)
|
||||
} else if err == gosrc.ErrQuickFork {
|
||||
message = append(message, "delete", err)
|
||||
if err := db.Delete(importPath); err != nil {
|
||||
log.Printf("ERROR db.Delete(%q): %v", importPath, err)
|
||||
}
|
||||
return nil, err
|
||||
default:
|
||||
} else if e, ok := err.(gosrc.NotFoundError); ok {
|
||||
message = append(message, "notfound:", e)
|
||||
if err := db.Delete(importPath); err != nil {
|
||||
log.Printf("ERROR db.Delete(%q): %v", importPath, err)
|
||||
}
|
||||
return nil, e
|
||||
} else {
|
||||
message = append(message, "ERROR:", err)
|
||||
return nil, err
|
||||
}
|
||||
}
|
||||
|
||||
// isInactivePkg reports whether the specified package is not imported
|
||||
// and has not been modified in 2 years.
|
||||
func isInactivePkg(pkg string, lastCommitted time.Time) bool {
|
||||
if lastCommitted.IsZero() ||
|
||||
time.Now().Before(lastCommitted.Add(2*365*24*time.Hour)) {
|
||||
return false
|
||||
}
|
||||
n, err := db.ImporterCount(pkg)
|
||||
if err != nil {
|
||||
log.Printf("ERROR db.ImporterCount(%q): %v", pkg, err)
|
||||
}
|
||||
return n == 0
|
||||
}
|
||||
|
|
|
@ -7,6 +7,7 @@
|
|||
package gosrc
|
||||
|
||||
import (
|
||||
"log"
|
||||
"net/http"
|
||||
"path"
|
||||
"regexp"
|
||||
|
@ -34,6 +35,11 @@ type bitbucketRepo struct {
|
|||
IsFork bool `json:"is_fork"`
|
||||
}
|
||||
|
||||
// bitbucketNode is one entry of the JSON object returned by the Bitbucket
// "branches" and "tags" endpoints, as decoded by getBitbucketDir.
type bitbucketNode struct {
	// Node is the value recorded for the branch or tag name.
	Node string `json:"node"`
	// Timestamp is the raw "utctimestamp" string; getBitbucketDir parses it
	// with the layout "2006-01-02 15:04:05Z07:00".
	Timestamp string `json:"utctimestamp"`
}
|
||||
|
||||
func getBitbucketDir(client *http.Client, match map[string]string, savedEtag string) (*Directory, error) {
|
||||
var repo *bitbucketRepo
|
||||
c := &httpClient{client: client}
|
||||
|
@ -50,27 +56,35 @@ func getBitbucketDir(client *http.Client, match map[string]string, savedEtag str
|
|||
}
|
||||
|
||||
tags := make(map[string]string)
|
||||
timestamps := make(map[string]time.Time)
|
||||
|
||||
for _, nodeType := range []string{"branches", "tags"} {
|
||||
var nodes map[string]struct {
|
||||
Node string
|
||||
}
|
||||
var nodes map[string]bitbucketNode
|
||||
if _, err := c.getJSON(expand("https://api.bitbucket.org/1.0/repositories/{owner}/{repo}/{0}", match, nodeType), &nodes); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
for t, n := range nodes {
|
||||
tags[t] = n.Node
|
||||
const timeFormat = "2006-01-02 15:04:05Z07:00"
|
||||
committed, err := time.Parse(timeFormat, n.Timestamp)
|
||||
if err != nil {
|
||||
log.Println("error parsing timestamp:", n.Timestamp)
|
||||
continue
|
||||
}
|
||||
timestamps[t] = committed
|
||||
}
|
||||
}
|
||||
|
||||
var err error
|
||||
match["tag"], match["commit"], err = bestTag(tags, defaultTags[match["vcs"]])
|
||||
tag, commit, err := bestTag(tags, defaultTags[match["vcs"]])
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
match["tag"] = tag
|
||||
match["commit"] = commit
|
||||
etag := expand("{vcs}-{commit}", match)
|
||||
if etag == savedEtag {
|
||||
return nil, ErrNotModified
|
||||
return nil, NotModifiedError{Since: timestamps[tag]}
|
||||
}
|
||||
|
||||
if repo == nil {
|
||||
|
|
105
gosrc/github.go
105
gosrc/github.go
|
@ -38,6 +38,15 @@ var (
|
|||
ownerRepoPat = regexp.MustCompile(`^https://api.github.com/repos/([^/]+)/([^/]+)/`)
|
||||
)
|
||||
|
||||
// githubCommit holds the fields decoded from one element of the GitHub
// commits listing: the commit SHA and the committer date.
type githubCommit struct {
	// ID is decoded from the "sha" field.
	ID string `json:"sha"`

	Commit struct {
		Committer struct {
			// Date is decoded from commit.committer.date.
			Date time.Time `json:"date"`
		} `json:"committer"`
	} `json:"commit"`
}
|
||||
|
||||
func gitHubError(resp *http.Response) error {
|
||||
var e struct {
|
||||
Message string `json:"message"`
|
||||
|
@ -52,53 +61,34 @@ func getGitHubDir(client *http.Client, match map[string]string, savedEtag string
|
|||
|
||||
c := &httpClient{client: client, errFn: gitHubError}
|
||||
|
||||
type refJSON struct {
|
||||
Object struct {
|
||||
Type string
|
||||
Sha string
|
||||
URL string
|
||||
}
|
||||
Ref string
|
||||
URL string
|
||||
var repo struct {
|
||||
Fork bool `json:"fork"`
|
||||
Stars int `json:"stargazers_count"`
|
||||
CreatedAt time.Time `json:"created_at"`
|
||||
PushedAt time.Time `json:"pushed_at"`
|
||||
}
|
||||
var refs []*refJSON
|
||||
|
||||
resp, err := c.getJSON(expand("https://api.github.com/repos/{owner}/{repo}/git/refs", match), &refs)
|
||||
if err != nil {
|
||||
if _, err := c.getJSON(expand("https://api.github.com/repos/{owner}/{repo}", match), &repo); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
// If the response contains a Link header, then fallback to requesting "master" and "go1" by name.
|
||||
if resp.Header.Get("Link") != "" {
|
||||
var masterRef refJSON
|
||||
if _, err := c.getJSON(expand("https://api.github.com/repos/{owner}/{repo}/git/refs/heads/master", match), &masterRef); err == nil {
|
||||
refs = append(refs, &masterRef)
|
||||
}
|
||||
|
||||
var go1Ref refJSON
|
||||
if _, err := c.getJSON(expand("https://api.github.com/repos/{owner}/{repo}/git/refs/tags/go1", match), &go1Ref); err == nil {
|
||||
refs = append(refs, &go1Ref)
|
||||
}
|
||||
var commits []*githubCommit
|
||||
url := expand("https://api.github.com/repos/{owner}/{repo}/commits", match)
|
||||
url += fmt.Sprintf("?since=%s", repo.CreatedAt.Format(time.RFC3339))
|
||||
if match["dir"] != "" {
|
||||
url += fmt.Sprintf("&path=%s", match["dir"])
|
||||
}
|
||||
|
||||
tags := make(map[string]string)
|
||||
for _, ref := range refs {
|
||||
switch {
|
||||
case strings.HasPrefix(ref.Ref, "refs/heads/"):
|
||||
tags[ref.Ref[len("refs/heads/"):]] = ref.Object.Sha
|
||||
case strings.HasPrefix(ref.Ref, "refs/tags/"):
|
||||
tags[ref.Ref[len("refs/tags/"):]] = ref.Object.Sha
|
||||
}
|
||||
}
|
||||
|
||||
var commit string
|
||||
match["tag"], commit, err = bestTag(tags, "master")
|
||||
if err != nil {
|
||||
if _, err := c.getJSON(url, &commits); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
if commit == savedEtag {
|
||||
return nil, ErrNotModified
|
||||
if repo.Fork && isQuickFork(commits, repo.CreatedAt) {
|
||||
return nil, ErrQuickFork
|
||||
}
|
||||
if len(commits) == 0 {
|
||||
return nil, NotFoundError{Message: "package directory changed or removed"}
|
||||
}
|
||||
if commits[0].ID == savedEtag {
|
||||
return nil, NotModifiedError{Since: commits[0].Commit.Committer.Date}
|
||||
}
|
||||
|
||||
var contents []*struct {
|
||||
|
@ -108,7 +98,7 @@ func getGitHubDir(client *http.Client, match map[string]string, savedEtag string
|
|||
HTMLURL string `json:"html_url"`
|
||||
}
|
||||
|
||||
if _, err := c.getJSON(expand("https://api.github.com/repos/{owner}/{repo}/contents{dir}?ref={tag}", match), &contents); err != nil {
|
||||
if _, err := c.getJSON(expand("https://api.github.com/repos/{owner}/{repo}/contents{dir}", match), &contents); err != nil {
|
||||
// The GitHub content API returns array values for directories
|
||||
// and object values for files. If there's a type mismatch at
|
||||
// the beginning of the response, then assume that the path is
|
||||
|
@ -157,25 +147,14 @@ func getGitHubDir(client *http.Client, match map[string]string, savedEtag string
|
|||
|
||||
browseURL := expand("https://github.com/{owner}/{repo}", match)
|
||||
if match["dir"] != "" {
|
||||
browseURL = expand("https://github.com/{owner}/{repo}/tree/{tag}{dir}", match)
|
||||
}
|
||||
|
||||
var repo = struct {
|
||||
Fork bool `json:"fork"`
|
||||
Stars int `json:"stargazers_count"`
|
||||
CreatedAt time.Time `json:"created_at"`
|
||||
PushedAt time.Time `json:"pushed_at"`
|
||||
}{}
|
||||
|
||||
if _, err := c.getJSON(expand("https://api.github.com/repos/{owner}/{repo}", match), &repo); err != nil {
|
||||
return nil, err
|
||||
browseURL = expand("https://github.com/{owner}/{repo}/tree{dir}", match)
|
||||
}
|
||||
|
||||
isDeadEndFork := repo.Fork && repo.PushedAt.Before(repo.CreatedAt)
|
||||
|
||||
return &Directory{
|
||||
BrowseURL: browseURL,
|
||||
Etag: commit,
|
||||
Etag: commits[0].ID,
|
||||
Files: files,
|
||||
LineFmt: "%s#L%d",
|
||||
ProjectName: match["repo"],
|
||||
|
@ -189,6 +168,24 @@ func getGitHubDir(client *http.Client, match map[string]string, savedEtag string
|
|||
}, nil
|
||||
}
|
||||
|
||||
// isQuickFork reports whether the repository is a "quick fork":
|
||||
// it has fewer than 3 commits, all within a week of the repo creation, createdAt.
|
||||
func isQuickFork(commits []*githubCommit, createdAt time.Time) bool {
|
||||
if len(commits) > 2 {
|
||||
return false
|
||||
}
|
||||
oneWeekOld := createdAt.Add(7 * 24 * time.Hour)
|
||||
if oneWeekOld.After(time.Now()) {
|
||||
return false // a newborn baby of a repository
|
||||
}
|
||||
for _, commit := range commits {
|
||||
if commit.Commit.Committer.Date.After(oneWeekOld) {
|
||||
return false
|
||||
}
|
||||
}
|
||||
return true
|
||||
}
|
||||
|
||||
func getGitHubPresentation(client *http.Client, match map[string]string) (*Presentation, error) {
|
||||
c := &httpClient{client: client, header: gitHubRawHeader}
|
||||
|
||||
|
@ -310,7 +307,7 @@ func getGistDir(client *http.Client, match map[string]string, savedEtag string)
|
|||
commit := gist.History[0].Version
|
||||
|
||||
if commit == savedEtag {
|
||||
return nil, ErrNotModified
|
||||
return nil, NotModifiedError{}
|
||||
}
|
||||
|
||||
var files []*File
|
||||
|
|
|
@ -23,12 +23,6 @@ func getStandardDir(client *http.Client, importPath string, savedEtag string) (*
|
|||
|
||||
browseURL := "https://golang.org/src/" + importPath + "/"
|
||||
p, err := c.getBytes(browseURL)
|
||||
if IsNotFound(err) {
|
||||
// Fallback to Go 1.3 directory structure.
|
||||
// TODO(garyburd): Delete fallback after 1.4 is pushed to golang.org.
|
||||
browseURL = "https://golang.org/src/pkg/" + importPath + "/"
|
||||
p, err = c.getBytes(browseURL)
|
||||
}
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
@ -40,7 +34,7 @@ func getStandardDir(client *http.Client, importPath string, savedEtag string) (*
|
|||
}
|
||||
etag = strings.Trim(string(m[1]), ". ")
|
||||
if etag == savedEtag {
|
||||
return nil, ErrNotModified
|
||||
return nil, NotModifiedError{}
|
||||
}
|
||||
|
||||
var files []*File
|
||||
|
|
|
@ -75,7 +75,7 @@ func getGoogleDir(client *http.Client, match map[string]string, savedEtag string
|
|||
}
|
||||
etag = expand("{vcs}-{0}", match, string(m[1]))
|
||||
if etag == savedEtag {
|
||||
return nil, ErrNotModified
|
||||
return nil, NotModifiedError{}
|
||||
}
|
||||
|
||||
var subdirs []string
|
||||
|
|
|
@ -16,6 +16,7 @@ import (
|
|||
"path"
|
||||
"regexp"
|
||||
"strings"
|
||||
"time"
|
||||
)
|
||||
|
||||
// File represents a file.
|
||||
|
@ -112,8 +113,15 @@ func (e *RemoteError) Error() string {
|
|||
return e.err.Error()
|
||||
}
|
||||
|
||||
// ErrNotModified indicates that the directory matches the specified etag.
|
||||
var ErrNotModified = errors.New("package not modified")
|
||||
// NotModifiedError indicates that the directory matches the specified etag.
type NotModifiedError struct {
	// Since is the time attached to the unmodified revision. It is the zero
	// time when the error is constructed without one.
	Since time.Time
}

// Error implements the error interface. The Since time is rendered in
// RFC 1123 format.
func (e NotModifiedError) Error() string {
	since := e.Since.Format(time.RFC1123)
	return fmt.Sprintf("package not modified since %s", since)
}
|
||||
|
||||
var ErrQuickFork = errors.New("package is a quick bug-fix fork")
|
||||
|
||||
var errNoMatch = errors.New("no match")
|
||||
|
||||
|
|
|
@ -119,7 +119,7 @@ func getLaunchpadDir(client *http.Client, match map[string]string, savedEtag str
|
|||
hash = m.Sum(hash[:0])
|
||||
etag := hex.EncodeToString(hash)
|
||||
if etag == savedEtag {
|
||||
return nil, ErrNotModified
|
||||
return nil, NotModifiedError{}
|
||||
}
|
||||
|
||||
return &Directory{
|
||||
|
|
|
@ -148,7 +148,7 @@ func downloadGit(schemes []string, clonePath, repo, savedEtag string) (string, s
|
|||
etag := scheme + "-" + commit
|
||||
|
||||
if etag == savedEtag {
|
||||
return "", "", ErrNotModified
|
||||
return "", "", NotModifiedError{}
|
||||
}
|
||||
|
||||
dir := filepath.Join(TempDir, repo+".git")
|
||||
|
@ -201,7 +201,7 @@ func downloadSVN(schemes []string, clonePath, repo, savedEtag string) (string, s
|
|||
|
||||
etag := scheme + "-" + revno
|
||||
if etag == savedEtag {
|
||||
return "", "", ErrNotModified
|
||||
return "", "", NotModifiedError{}
|
||||
}
|
||||
|
||||
dir := filepath.Join(TempDir, repo+".svn")
|
||||
|
|
Загрузка…
Ссылка в новой задаче