archive: Optimize ChangesDirs on Linux

If we tear through a few layers of abstraction, we can get at the inodes
contained in a directory without having to stat all the files. This
allows us to eliminate identical files much earlier in the changelist
generation process.

Signed-off-by: Burke Libbey <burke@libbey.me>
This commit is contained in:
Burke Libbey 2015-04-14 15:28:54 -04:00
Родитель aebeefa886
Коммит 45c45a2c9a
Не найден ключ, соответствующий данной подписи
Идентификатор ключа GPG: E893DEF914F22410
4 изменённых файлов: 376 добавлений и 64 удалений

Просмотреть файл

@ -4,6 +4,7 @@ import (
"bytes"
"fmt"
"io"
"io/ioutil"
"os"
"path/filepath"
"sort"
@ -277,78 +278,23 @@ func newRootFileInfo() *FileInfo {
return root
}
func collectFileInfo(sourceDir string) (*FileInfo, error) {
root := newRootFileInfo()
err := filepath.Walk(sourceDir, func(path string, f os.FileInfo, err error) error {
if err != nil {
return err
}
// Rebase path
relPath, err := filepath.Rel(sourceDir, path)
if err != nil {
return err
}
relPath = filepath.Join("/", relPath)
if relPath == "/" {
return nil
}
parent := root.LookUp(filepath.Dir(relPath))
if parent == nil {
return fmt.Errorf("collectFileInfo: Unexpectedly no parent for %s", relPath)
}
info := &FileInfo{
name: filepath.Base(relPath),
children: make(map[string]*FileInfo),
parent: parent,
}
s, err := system.Lstat(path)
if err != nil {
return err
}
info.stat = s
info.capability, _ = system.Lgetxattr(path, "security.capability")
parent.children[info.name] = info
return nil
})
if err != nil {
return nil, err
}
return root, nil
}
// ChangesDirs compares two directories and generates an array of Change objects describing the changes.
// If oldDir is "", then all files in newDir will be Add-Changes.
func ChangesDirs(newDir, oldDir string) ([]Change, error) {
var (
oldRoot, newRoot *FileInfo
err1, err2 error
errs = make(chan error, 2)
)
go func() {
if oldDir != "" {
oldRoot, err1 = collectFileInfo(oldDir)
}
errs <- err1
}()
go func() {
newRoot, err2 = collectFileInfo(newDir)
errs <- err2
}()
// block until both routines have returned
for i := 0; i < 2; i++ {
if err := <-errs; err != nil {
if oldDir == "" {
emptyDir, err := ioutil.TempDir("", "empty")
if err != nil {
return nil, err
}
defer os.Remove(emptyDir)
oldDir = emptyDir
}
oldRoot, newRoot, err := collectFileInfoForChanges(oldDir, newDir)
if err != nil {
return nil, err
}
return newRoot.Changes(oldRoot), nil

Просмотреть файл

@ -0,0 +1,276 @@
package archive
import (
"bytes"
"fmt"
"os"
"path/filepath"
"sort"
"syscall"
"unsafe"
"github.com/docker/docker/pkg/system"
)
// walker is used to implement collectFileInfoForChanges on linux. Where this
// method in general returns the entire contents of two directory trees, we
// optimize some FS calls out on linux. In particular, we take advantage of the
// fact that getdents(2) returns the inode of each file in the directory being
// walked, which, when walking two trees in parallel to generate a list of
// changes, can be used to prune subtrees without ever having to lstat(2) them
// directly. Eliminating stat calls in this way can save up to seconds on large
// images.
type walker struct {
dir1 string
dir2 string
root1 *FileInfo
root2 *FileInfo
}
// collectFileInfoForChanges returns a complete representation of the trees
// rooted at dir1 and dir2, with one important exception: any subtree or
// leaf where the inode and device numbers are an exact match between dir1
// and dir2 will be pruned from the results. This method is *only* to be used
// to generating a list of changes between the two directories, as it does not
// reflect the full contents.
func collectFileInfoForChanges(dir1, dir2 string) (*FileInfo, *FileInfo, error) {
w := &walker{
dir1: dir1,
dir2: dir2,
root1: newRootFileInfo(),
root2: newRootFileInfo(),
}
i1, err := os.Lstat(w.dir1)
if err != nil {
return nil, nil, err
}
i2, err := os.Lstat(w.dir2)
if err != nil {
return nil, nil, err
}
if err := w.walk("/", i1, i2); err != nil {
return nil, nil, err
}
return w.root1, w.root2, nil
}
// Given a FileInfo, its path info, and a reference to the root of the tree
// being constructed, register this file with the tree.
func walkchunk(path string, fi os.FileInfo, dir string, root *FileInfo) error {
if fi == nil {
return nil
}
parent := root.LookUp(filepath.Dir(path))
if parent == nil {
return fmt.Errorf("collectFileInfoForChanges: Unexpectedly no parent for %s", path)
}
info := &FileInfo{
name: filepath.Base(path),
children: make(map[string]*FileInfo),
parent: parent,
}
cpath := filepath.Join(dir, path)
stat, err := system.FromStatT(fi.Sys().(*syscall.Stat_t))
if err != nil {
return err
}
info.stat = stat
info.capability, _ = system.Lgetxattr(cpath, "security.capability") // lgetxattr(2): fs access
parent.children[info.name] = info
return nil
}
// Walk a subtree rooted at the same path in both trees being iterated. For
// example, /docker/overlay/1234/a/b/c/d and /docker/overlay/8888/a/b/c/d
func (w *walker) walk(path string, i1, i2 os.FileInfo) (err error) {
// Register these nodes with the return trees, unless we're still at the
// (already-created) roots:
if path != "/" {
if err := walkchunk(path, i1, w.dir1, w.root1); err != nil {
return err
}
if err := walkchunk(path, i2, w.dir2, w.root2); err != nil {
return err
}
}
is1Dir := i1 != nil && i1.IsDir()
is2Dir := i2 != nil && i2.IsDir()
// If these files are both non-existent, or leaves (non-dirs), we are done.
if !is1Dir && !is2Dir {
return nil
}
// Fetch the names of all the files contained in both directories being walked:
var names1, names2 []nameIno
if is1Dir {
names1, err = readdirnames(filepath.Join(w.dir1, path)) // getdents(2): fs access
if err != nil {
return err
}
}
if is2Dir {
names2, err = readdirnames(filepath.Join(w.dir2, path)) // getdents(2): fs access
if err != nil {
return err
}
}
// We have lists of the files contained in both parallel directories, sorted
// in the same order. Walk them in parallel, generating a unique merged list
// of all items present in either or both directories.
var names []string
ix1 := 0
ix2 := 0
for {
if ix1 >= len(names1) {
break
}
if ix2 >= len(names2) {
break
}
ni1 := names1[ix1]
ni2 := names2[ix2]
switch bytes.Compare([]byte(ni1.name), []byte(ni2.name)) {
case -1: // ni1 < ni2 -- advance ni1
// we will not encounter ni1 in names2
names = append(names, ni1.name)
ix1++
case 0: // ni1 == ni2
if ni1.ino != ni2.ino {
names = append(names, ni1.name)
}
ix1++
ix2++
case 1: // ni1 > ni2 -- advance ni2
// we will not encounter ni2 in names1
names = append(names, ni2.name)
ix2++
}
}
for ix1 < len(names1) {
names = append(names, names1[ix1].name)
ix1++
}
for ix2 < len(names2) {
names = append(names, names2[ix2].name)
ix2++
}
// For each of the names present in either or both of the directories being
// iterated, stat the name under each root, and recurse the pair of them:
for _, name := range names {
fname := filepath.Join(path, name)
var cInfo1, cInfo2 os.FileInfo
if is1Dir {
cInfo1, err = os.Lstat(filepath.Join(w.dir1, fname)) // lstat(2): fs access
if err != nil && !os.IsNotExist(err) {
return err
}
}
if is2Dir {
cInfo2, err = os.Lstat(filepath.Join(w.dir2, fname)) // lstat(2): fs access
if err != nil && !os.IsNotExist(err) {
return err
}
}
if err = w.walk(fname, cInfo1, cInfo2); err != nil {
return err
}
}
return nil
}
// {name,inode} pairs used to support the early-pruning logic of the walker type
type nameIno struct {
name string
ino uint64
}
type nameInoSlice []nameIno
func (s nameInoSlice) Len() int { return len(s) }
func (s nameInoSlice) Swap(i, j int) { s[i], s[j] = s[j], s[i] }
func (s nameInoSlice) Less(i, j int) bool { return s[i].name < s[j].name }
// readdirnames is a hacked-apart version of the Go stdlib code, exposing inode
// numbers further up the stack when reading directory contents. Unlike
// os.Readdirnames, which returns a list of filenames, this function returns a
// list of {filename,inode} pairs.
func readdirnames(dirname string) (names []nameIno, err error) {
var (
size = 100
buf = make([]byte, 4096)
nbuf int
bufp int
nb int
)
f, err := os.Open(dirname)
if err != nil {
return nil, err
}
defer f.Close()
names = make([]nameIno, 0, size) // Empty with room to grow.
for {
// Refill the buffer if necessary
if bufp >= nbuf {
bufp = 0
nbuf, err = syscall.ReadDirent(int(f.Fd()), buf) // getdents on linux
if nbuf < 0 {
nbuf = 0
}
if err != nil {
return nil, os.NewSyscallError("readdirent", err)
}
if nbuf <= 0 {
break // EOF
}
}
// Drain the buffer
nb, names = parseDirent(buf[bufp:nbuf], names)
bufp += nb
}
sl := nameInoSlice(names)
sort.Sort(sl)
return sl, nil
}
// parseDirent is a minor modification of syscall.ParseDirent (linux version)
// which returns {name,inode} pairs instead of just names.
func parseDirent(buf []byte, names []nameIno) (consumed int, newnames []nameIno) {
origlen := len(buf)
for len(buf) > 0 {
dirent := (*syscall.Dirent)(unsafe.Pointer(&buf[0]))
buf = buf[dirent.Reclen:]
if dirent.Ino == 0 { // File absent in directory.
continue
}
bytes := (*[10000]byte)(unsafe.Pointer(&dirent.Name[0]))
var name = string(bytes[0:clen(bytes[:])])
if name == "." || name == ".." { // Useless names
continue
}
names = append(names, nameIno{name, dirent.Ino})
}
return origlen - len(buf), names
}
func clen(n []byte) int {
for i := 0; i < len(n); i++ {
if n[i] == 0 {
return i
}
}
return len(n)
}

Просмотреть файл

@ -0,0 +1,84 @@
// +build !linux
package archive
import (
"fmt"
"os"
"path/filepath"
"github.com/docker/docker/pkg/system"
)
func collectFileInfoForChanges(oldDir, newDir string) (*FileInfo, *FileInfo, error) {
var (
oldRoot, newRoot *FileInfo
err1, err2 error
errs = make(chan error, 2)
)
go func() {
oldRoot, err1 = collectFileInfo(oldDir)
errs <- err1
}()
go func() {
newRoot, err2 = collectFileInfo(newDir)
errs <- err2
}()
// block until both routines have returned
for i := 0; i < 2; i++ {
if err := <-errs; err != nil {
return nil, nil, err
}
}
return oldRoot, newRoot, nil
}
func collectFileInfo(sourceDir string) (*FileInfo, error) {
root := newRootFileInfo()
err := filepath.Walk(sourceDir, func(path string, f os.FileInfo, err error) error {
if err != nil {
return err
}
// Rebase path
relPath, err := filepath.Rel(sourceDir, path)
if err != nil {
return err
}
relPath = filepath.Join("/", relPath)
if relPath == "/" {
return nil
}
parent := root.LookUp(filepath.Dir(relPath))
if parent == nil {
return fmt.Errorf("collectFileInfo: Unexpectedly no parent for %s", relPath)
}
info := &FileInfo{
name: filepath.Base(relPath),
children: make(map[string]*FileInfo),
parent: parent,
}
s, err := system.Lstat(path)
if err != nil {
return err
}
info.stat = s
info.capability, _ = system.Lgetxattr(path, "security.capability")
parent.children[info.name] = info
return nil
})
if err != nil {
return nil, err
}
return root, nil
}

Просмотреть файл

@ -14,6 +14,12 @@ func fromStatT(s *syscall.Stat_t) (*Stat_t, error) {
mtim: s.Mtim}, nil
}
// FromStatT exists only on linux, and loads a system.Stat_t from a
// syscal.Stat_t.
func FromStatT(s *syscall.Stat_t) (*Stat_t, error) {
return fromStatT(s)
}
// Stat takes a path to a file and returns
// a system.Stat_t type pertaining to that file.
//