diff --git a/database/database.go b/database/database.go index 0d3f114..117b5df 100644 --- a/database/database.go +++ b/database/database.go @@ -170,12 +170,6 @@ var putScript = redis.NewScript(0, ` redis.call('SREM', 'index:' .. term, id) elseif x == 2 then redis.call('SADD', 'index:' .. term, id) - if string.sub(term, 1, 7) == 'import:' then - local import = string.sub(term, 8) - if redis.call('HEXISTS', 'ids', import) == 0 and redis.call('SISMEMBER', 'badCrawl', import) == 0 then - redis.call('SADD', 'newCrawl', import) - end - end end end @@ -190,6 +184,15 @@ var putScript = redis.NewScript(0, ` return redis.call('HMSET', 'pkg:' .. id, 'path', path, 'synopsis', synopsis, 'score', score, 'gob', gob, 'terms', terms, 'etag', etag, 'kind', kind) `) +var addCrawlScript = redis.NewScript(0, ` + for i=1,#ARGV do + local pkg = ARGV[i] + if redis.call('HEXISTS', 'ids', pkg) == 0 and redis.call('SISMEMBER', 'badCrawl', pkg) == 0 then + redis.call('SADD', 'newCrawl', pkg) + end + end +`) + // Put adds the package documentation to the database. func (db *Database) Put(pdoc *doc.Package, nextCrawl time.Time) error { c := db.Pool.Get() @@ -204,7 +207,7 @@ func (db *Database) Put(pdoc *doc.Package, nextCrawl time.Time) error { } // Truncate large documents. - if gobBuf.Len() > 700000 { + if gobBuf.Len() > 200000 { pdocNew := *pdoc pdoc = &pdocNew pdoc.Truncated = true @@ -236,7 +239,45 @@ func (db *Database) Put(pdoc *doc.Package, nextCrawl time.Time) error { if !nextCrawl.IsZero() { t = nextCrawl.Unix() } + _, err = putScript.Do(c, pdoc.ImportPath, pdoc.Synopsis, score, gobBytes, strings.Join(terms, " "), pdoc.Etag, kind, t) + if err != nil { + return err + } + + if nextCrawl.IsZero() { + // Skip crawling related packages if this is not a full save. 
+ return nil + } + + paths := make(map[string]bool) + for _, p := range pdoc.Imports { + if doc.IsValidRemotePath(p) { + paths[p] = true + } + } + for _, p := range pdoc.TestImports { + if doc.IsValidRemotePath(p) { + paths[p] = true + } + } + for _, p := range pdoc.XTestImports { + if doc.IsValidRemotePath(p) { + paths[p] = true + } + } + if pdoc.ImportPath != pdoc.ProjectRoot && pdoc.ProjectRoot != "" { + paths[pdoc.ProjectRoot] = true + } + for _, p := range pdoc.Subdirectories { + paths[pdoc.ImportPath+"/"+p] = true + } + + args := make([]interface{}, 0, len(paths)) + for p := range paths { + args = append(args, p) + } + _, err = addCrawlScript.Do(c, args...) return err } @@ -456,7 +497,6 @@ var deleteScript = redis.NewScript(0, ` end redis.call('ZREM', 'nextCrawl', id) - redis.call('SREM', 'badCrawl', path) redis.call('SREM', 'newCrawl', path) redis.call('ZREM', 'popular', id) redis.call('DEL', 'pkg:' .. id) @@ -670,6 +710,7 @@ type PackageInfo struct { Pkgs []Package Score float64 Kind string + Size int } // Do executes function f for each document in the database. 
@@ -681,18 +722,20 @@ func (db *Database) Do(f func(*PackageInfo) error) error { return err } for _, key := range keys { - values, err := redis.Values(c.Do("HMGET", key, "gob", "score", "kind", "path")) + values, err := redis.Values(c.Do("HMGET", key, "gob", "score", "kind", "path", "terms", "synopsis")) if err != nil { return err } var ( - pi PackageInfo - p []byte - path string + pi PackageInfo + p []byte + path string + terms string + synopsis string ) - if _, err := redis.Scan(values, &p, &pi.Score, &pi.Kind, &path); err != nil { + if _, err := redis.Scan(values, &p, &pi.Score, &pi.Kind, &path, &terms, &synopsis); err != nil { return err } @@ -700,6 +743,8 @@ func (db *Database) Do(f func(*PackageInfo) error) error { continue } + pi.Size = len(path) + len(p) + len(terms) + len(synopsis) + p, err = snappy.Decode(nil, p) if err != nil { return fmt.Errorf("snappy decoding %s: %v", path, err) } @@ -810,7 +855,7 @@ func (db *Database) GetGob(key string, value interface{}) error { return gob.NewDecoder(bytes.NewReader(p)).Decode(value) } -var incrementPopularScore = redis.NewScript(0, ` +var incrementPopularScoreScript = redis.NewScript(0, ` local path = ARGV[1] local n = ARGV[2] local t = ARGV[3] @@ -832,20 +877,21 @@ var incrementPopularScore = redis.NewScript(0, ` const popularHalfLife = time.Hour * 24 * 7 -func scaledTime(t time.Time) float64 { - const lambda = math.Ln2 / float64(popularHalfLife) - return lambda * float64(t.Sub(time.Unix(1257894000, 0))) -} - -func (db *Database) IncrementPopularScore(path string) error { +func (db *Database) incrementPopularScoreInternal(path string, delta float64, t time.Time) error { // nt = n0 * math.Exp(-lambda * t) // lambda = math.Ln2 / thalf c := db.Pool.Get() defer c.Close() - _, err := incrementPopularScore.Do(c, path, 1, scaledTime(time.Now())) + const lambda = math.Ln2 / float64(popularHalfLife) + scaledTime := lambda * float64(t.Sub(time.Unix(1257894000, 0))) + _, err := incrementPopularScoreScript.Do(c, path, delta, 
scaledTime) return err } +func (db *Database) IncrementPopularScore(path string) error { + return db.incrementPopularScoreInternal(path, 1, time.Now()) +} + var popularScript = redis.NewScript(0, ` local stop = ARGV[1] local ids = redis.call('ZREVRANGE', 'popular', '0', stop) @@ -892,26 +938,59 @@ func (db *Database) PopularWithScores() ([]Package, error) { return pkgs, err } -func (db *Database) GetNewCrawl() (string, error) { +func (db *Database) PopNewCrawl() (string, bool, error) { c := db.Pool.Get() defer c.Close() - v, err := redis.String(c.Do("SRANDMEMBER", "newCrawl")) - if err == redis.ErrNil { + + var subdirs []Package + + path, err := redis.String(c.Do("SPOP", "newCrawl")) + switch { + case err == redis.ErrNil: err = nil + path = "" + case err == nil: + subdirs, err = db.getSubdirs(c, path, nil) } - return v, err + return path, len(subdirs) > 0, err } -var setBadCrawlScript = redis.NewScript(0, ` - local path = ARGV[1] - if redis.call('SREM', 'newCrawl', path) == 1 then - redis.call('SADD', 'badCrawl', path) - end -`) - -func (db *Database) SetBadCrawl(path string) error { +func (db *Database) AddBadCrawl(path string) error { c := db.Pool.Get() defer c.Close() - _, err := setBadCrawlScript.Do(c, path) + _, err := c.Do("SADD", "badCrawl", path) return err } + +var incrementCounterScript = redis.NewScript(0, ` + local key = 'counter:' .. 
ARGV[1] + local n = tonumber(ARGV[2]) + local t = tonumber(ARGV[3]) + local exp = tonumber(ARGV[4]) + + local counter = redis.call('GET', key) + if counter then + counter = cjson.decode(counter) + n = n + counter.n * math.exp(counter.t - t) + end + + redis.call('SET', key, cjson.encode({n = n; t = t})) + redis.call('EXPIRE', key, exp) + return tostring(n) +`) + +const counterHalflife = time.Hour + +func (db *Database) incrementCounterInternal(key string, delta float64, t time.Time) (float64, error) { + // nt = n0 * math.Exp(-lambda * t) + // lambda = math.Ln2 / thalf + c := db.Pool.Get() + defer c.Close() + const lambda = math.Ln2 / float64(counterHalflife) + scaledTime := lambda * float64(t.Sub(time.Unix(1257894000, 0))) + return redis.Float64(incrementCounterScript.Do(c, key, delta, scaledTime, (4*counterHalflife)/time.Second)) +} + +func (db *Database) IncrementCounter(key string, delta float64) (float64, error) { + return db.incrementCounterInternal(key, delta, time.Now()) +} diff --git a/database/database_test.go b/database/database_test.go index 669f456..5dcb46a 100644 --- a/database/database_test.go +++ b/database/database_test.go @@ -193,6 +193,8 @@ func TestPutGet(t *testing.T) { } } +const epsilon = 0.000001 + func TestPopular(t *testing.T) { db := newDB(t) defer closeDB(db) @@ -207,7 +209,7 @@ func TestPopular(t *testing.T) { for id := 12; id >= 0; id-- { path := "github.com/user/repo/p" + strconv.Itoa(id) c.Do("HSET", "ids", path, id) - _, err := incrementPopularScore.Do(c, path, score, scaledTime(now)) + err := db.incrementPopularScoreInternal(path, score, now) if err != nil { t.Fatal(err) } @@ -227,8 +229,39 @@ func TestPopular(t *testing.T) { } for i := 3; i < len(values); i += 2 { s, _ := redis.Float64(values[i], nil) - if math.Abs(score-s)/score > 0.0001 { + if math.Abs(score-s)/score > epsilon { t.Errorf("Bad score, score[1]=%g, score[%d]=%g", score, i, s) } } } + +func TestCounter(t *testing.T) { + db := newDB(t) + defer closeDB(db) + + const key 
= "127.0.0.1" + + now := time.Now() + n, err := db.incrementCounterInternal(key, 1, now) + if err != nil { + t.Fatal(err) + } + if math.Abs(n-1.0) > epsilon { + t.Errorf("1: got n=%g, want 1", n) + } + n, err = db.incrementCounterInternal(key, 1, now) + if err != nil { + t.Fatal(err) + } + if math.Abs(n-2.0)/2.0 > epsilon { + t.Errorf("2: got n=%g, want 2", n) + } + now = now.Add(counterHalflife) + n, err = db.incrementCounterInternal(key, 1, now) + if err != nil { + t.Fatal(err) + } + if math.Abs(n-2.0)/2.0 > epsilon { + t.Errorf("3: got n=%g, want 2", n) + } +} diff --git a/doc/bitbucket.go b/doc/bitbucket.go index 640f70d..0d84352 100644 --- a/doc/bitbucket.go +++ b/doc/bitbucket.go @@ -63,18 +63,19 @@ func getBitbucketDoc(client *http.Client, match map[string]string, savedEtag str return nil, ErrNotModified } - var directory struct { - Files []struct { + var contents struct { + Directories []string + Files []struct { Path string } } - if err := httpGetJSON(client, expand("https://api.bitbucket.org/1.0/repositories/{owner}/{repo}/src/{tag}{dir}/", match), nil, &directory); err != nil { + if err := httpGetJSON(client, expand("https://api.bitbucket.org/1.0/repositories/{owner}/{repo}/src/{tag}{dir}/", match), nil, &contents); err != nil { return nil, err } var files []*source - for _, f := range directory.Files { + for _, f := range contents.Files { _, name := path.Split(f.Path) if isDocFile(name) { files = append(files, &source{ @@ -91,14 +92,15 @@ func getBitbucketDoc(client *http.Client, match map[string]string, savedEtag str b := builder{ pdoc: &Package{ - LineFmt: "%s#cl-%d", - ImportPath: match["originalImportPath"], - ProjectRoot: expand("bitbucket.org/{owner}/{repo}", match), - ProjectName: match["repo"], - ProjectURL: expand("https://bitbucket.org/{owner}/{repo}/", match), - BrowseURL: expand("https://bitbucket.org/{owner}/{repo}/src/{tag}{dir}", match), - Etag: etag, - VCS: match["vcs"], + LineFmt: "%s#cl-%d", + ImportPath: match["originalImportPath"], + 
ProjectRoot: expand("bitbucket.org/{owner}/{repo}", match), + ProjectName: match["repo"], + ProjectURL: expand("https://bitbucket.org/{owner}/{repo}/", match), + BrowseURL: expand("https://bitbucket.org/{owner}/{repo}/src/{tag}{dir}", match), + Etag: etag, + VCS: match["vcs"], + Subdirectories: contents.Directories, }, } diff --git a/doc/builder.go b/doc/builder.go index e63ec04..c114ceb 100644 --- a/doc/builder.go +++ b/doc/builder.go @@ -16,6 +16,7 @@ package doc import ( "bytes" + "errors" "go/ast" "go/build" "go/doc" @@ -327,19 +328,22 @@ var packageNamePats = []*regexp.Regexp{ func simpleImporter(imports map[string]*ast.Object, path string) (*ast.Object, error) { pkg := imports[path] - if pkg == nil { - // Guess the package name without importing it. - for _, pat := range packageNamePats { - m := pat.FindStringSubmatch(path) - if m != nil { - pkg = ast.NewObj(ast.Pkg, m[1]) - pkg.Data = ast.NewScope(nil) - imports[path] = pkg - break - } + if pkg != nil { + return pkg, nil + } + + // Guess the package name without importing it. 
+ for _, pat := range packageNamePats { + m := pat.FindStringSubmatch(path) + if m != nil { + pkg = ast.NewObj(ast.Pkg, m[1]) + pkg.Data = ast.NewScope(nil) + imports[path] = pkg + return pkg, nil } } - return pkg, nil + + return nil, errors.New("package not found") } type File struct { diff --git a/doc/github.go b/doc/github.go index 8f12a40..e689365 100644 --- a/doc/github.go +++ b/doc/github.go @@ -17,7 +17,6 @@ package doc import ( "net/http" "net/url" - "path" "regexp" "strings" "time" @@ -73,49 +72,91 @@ func getGitHubDoc(client *http.Client, match map[string]string, savedEtag string return nil, ErrNotModified } - var tree struct { - Tree []struct { - Url string - Path string - Type string - } - Url string + var contents []*struct { + Type string + Name string + Git_URL string + HTML_URL string } - err = httpGetJSON(client, expand("https://api.github.com/repos/{owner}/{repo}/git/trees/{tag}?recursive=1&{cred}", match), nil, &tree) + err = httpGetJSON(client, expand("https://api.github.com/repos/{owner}/{repo}/contents{dir}?ref={tag}&{cred}", match), nil, &contents) if err != nil { return nil, err } - // Because Github API URLs are case-insensitive, we need to check that the - // userRepo returned from Github matches the one that we are requesting. - if !strings.HasPrefix(tree.Url, expand("https://api.github.com/repos/{owner}/{repo}/", match)) { + if len(contents) == 0 { + return nil, NotFoundError{"No files in directory."} + } + + // Because Github API URLs are case-insensitive, we check that the owner + // and repo returned from Github matches the one that we are requesting. 
+ if !strings.HasPrefix(contents[0].Git_URL, expand("https://api.github.com/repos/{owner}/{repo}/", match)) { return nil, NotFoundError{"Github import path has incorrect case."} } - inTree := false - dirPrefix := match["dir"] - if dirPrefix != "" { - dirPrefix = dirPrefix[1:] + "/" - } var files []*source - for _, node := range tree.Tree { - if node.Type != "blob" || !strings.HasPrefix(node.Path, dirPrefix) { - continue - } - inTree = true - if d, f := path.Split(node.Path); d == dirPrefix && isDocFile(f) { + var subdirs []string + + for _, item := range contents { + switch { + case item.Type == "dir": + if isValidPathElement(item.Name) { + subdirs = append(subdirs, item.Name) + } + case isDocFile(item.Name): files = append(files, &source{ - name: f, - browseURL: expand("https://github.com/{owner}/{repo}/blob/{tag}/{0}", match, node.Path), - rawURL: node.Url + "?" + gitHubCred, + name: item.Name, + browseURL: item.HTML_URL, + rawURL: item.Git_URL + "?" + gitHubCred, }) } } - if !inTree { - return nil, NotFoundError{"Directory tree does not contain Go files."} - } + /* + var tree struct { + Tree []struct { + Url string + Path string + Type string + } + Url string + } + + err = httpGetJSON(client, expand("https://api.github.com/repos/{owner}/{repo}/git/trees/{tag}?recursive=1&{cred}", match), nil, &tree) + if err != nil { + return nil, err + } + + // Because Github API URLs are case-insensitive, we need to check that the + // userRepo returned from Github matches the one that we are requesting. 
+ if !strings.HasPrefix(tree.Url, expand("https://api.github.com/repos/{owner}/{repo}/", match)) { + return nil, NotFoundError{"Github import path has incorrect case."} + } + + inTree := false + dirPrefix := match["dir"] + if dirPrefix != "" { + dirPrefix = dirPrefix[1:] + "/" + } + var files []*source + for _, node := range tree.Tree { + if node.Type != "blob" || !strings.HasPrefix(node.Path, dirPrefix) { + continue + } + inTree = true + if d, f := path.Split(node.Path); d == dirPrefix && isDocFile(f) { + files = append(files, &source{ + name: f, + browseURL: expand("https://github.com/{owner}/{repo}/blob/{tag}/{0}", match, node.Path), + rawURL: node.Url + "?" + gitHubCred, + }) + } + } + + if !inTree { + return nil, NotFoundError{"Directory tree does not contain Go files."} + } + */ if err := fetchFiles(client, files, gitHubRawHeader); err != nil { return nil, err @@ -128,14 +169,15 @@ func getGitHubDoc(client *http.Client, match map[string]string, savedEtag string b := &builder{ pdoc: &Package{ - LineFmt: "%s#L%d", - ImportPath: match["originalImportPath"], - ProjectRoot: expand("github.com/{owner}/{repo}", match), - ProjectName: match["repo"], - ProjectURL: expand("https://github.com/{owner}/{repo}", match), - BrowseURL: browseURL, - Etag: commit, - VCS: "git", + LineFmt: "%s#L%d", + ImportPath: match["originalImportPath"], + ProjectRoot: expand("github.com/{owner}/{repo}", match), + ProjectName: match["repo"], + ProjectURL: expand("https://github.com/{owner}/{repo}", match), + BrowseURL: browseURL, + Etag: commit, + VCS: "git", + Subdirectories: subdirs, }, } diff --git a/doc/google.go b/doc/google.go index 5e70227..f6ccde0 100644 --- a/doc/google.go +++ b/doc/google.go @@ -26,7 +26,7 @@ var ( googleRepoRe = regexp.MustCompile(`id="checkoutcmd">(hg|git|svn)`) googleRevisionRe = regexp.MustCompile(`