From 1c4202a6142d238d41f10deff1f0548f7591350b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Petazzoni?= Date: Wed, 30 Apr 2014 18:00:42 -0700 Subject: [PATCH 1/6] Mount /proc and /sys read-only, except in privileged containers. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit It has been pointed out that some files in /proc and /sys can be used to break out of containers. However, if those filesystems are mounted read-only, most of the known exploits are mitigated, since they rely on writing some file in those filesystems. This does not replace security modules (like SELinux or AppArmor), it is just another layer of security. Likewise, it doesn't mean that the other mitigations (shadowing parts of /proc or /sys with bind mounts) are useless. Those measures are still useful. As such, the shadowing of /proc/kcore is still enabled with both LXC and native drivers. Special care has to be taken with /proc/1/attr, which still needs to be mounted read-write in order to enable the AppArmor profile. It is bind-mounted from a private read-write mount of procfs. All that enforcement is done in dockerinit. The code doing the real work is in libcontainer. The init function for the LXC driver calls the function from libcontainer to avoid code duplication. Docker-DCO-1.1-Signed-off-by: Jérôme Petazzoni (github: jpetazzo) --- daemon/execdriver/lxc/driver.go | 5 ++ daemon/execdriver/lxc/lxc_template.go | 23 +++-- daemon/execdriver/native/create.go | 2 - integration-cli/docker_cli_run_test.go | 38 ++++++-- pkg/libcontainer/mount/init.go | 12 +-- pkg/libcontainer/nsinit/init.go | 14 ++- .../security/restrict/restrict.go | 86 ++++++++++++------- 7 files changed, 113 insertions(+), 67 deletions(-) diff --git a/daemon/execdriver/lxc/driver.go b/daemon/execdriver/lxc/driver.go index 6ee7f3c1dd..3fe44202ac 100644 --- a/daemon/execdriver/lxc/driver.go +++ b/daemon/execdriver/lxc/driver.go @@ -5,6 +5,7 @@ import ( "github.com/dotcloud/docker/daemon/execdriver" "github.com/dotcloud/docker/pkg/cgroups" "github.com/dotcloud/docker/pkg/label" + "github.com/dotcloud/docker/pkg/libcontainer/security/restrict" "github.com/dotcloud/docker/pkg/system" "github.com/dotcloud/docker/utils" "io/ioutil" @@ -35,6 +36,10 @@ func init() { return err } + if err := restrict.Restrict("/", "/empty"); err != nil { + return err + } + if err := setupCapabilities(args); err != nil { return err } diff --git a/daemon/execdriver/lxc/lxc_template.go b/daemon/execdriver/lxc/lxc_template.go index bc94e7a19d..03d32e72b5 100644 --- a/daemon/execdriver/lxc/lxc_template.go +++ b/daemon/execdriver/lxc/lxc_template.go @@ -82,15 +82,12 @@ lxc.pivotdir = lxc_putold # NOTICE: These mounts must be applied within the namespace -# WARNING: procfs is a known attack vector and should probably be disabled -# if your userspace allows it. eg. see http://blog.zx2c4.com/749 +# WARNING: mounting procfs and/or sysfs read-write is a known attack vector. +# See e.g. http://blog.zx2c4.com/749 and http://bit.ly/T9CkqJ +# We mount them read-write here, but later, dockerinit will call the Restrict() function to remount them read-only. +# We cannot mount them directly read-only, because that would prevent loading AppArmor profiles. lxc.mount.entry = proc {{escapeFstabSpaces $ROOTFS}}/proc proc nosuid,nodev,noexec 0 0 - -# WARNING: sysfs is a known attack vector and should probably be disabled -# if your userspace allows it. eg. see http://bit.ly/T9CkqJ -{{if .Privileged}} lxc.mount.entry = sysfs {{escapeFstabSpaces $ROOTFS}}/sys sysfs nosuid,nodev,noexec 0 0 -{{end}} {{if .Tty}} lxc.mount.entry = {{.Console}} {{escapeFstabSpaces $ROOTFS}}/dev/console none bind,rw 0 0 @@ -111,14 +108,14 @@ lxc.mount.entry = {{$value.Source}} {{escapeFstabSpaces $ROOTFS}}/{{escapeFstabS {{if .AppArmor}} lxc.aa_profile = unconfined {{else}} -# not unconfined +# Let AppArmor normal confinement take place (i.e., not unconfined) {{end}} {{else}} -# restrict access to proc -lxc.mount.entry = {{.RestrictionSource}} {{escapeFstabSpaces $ROOTFS}}/proc/sys none bind,ro 0 0 -lxc.mount.entry = {{.RestrictionSource}} {{escapeFstabSpaces $ROOTFS}}/proc/irq none bind,ro 0 0 -lxc.mount.entry = {{.RestrictionSource}} {{escapeFstabSpaces $ROOTFS}}/proc/acpi none bind,ro 0 0 -lxc.mount.entry = {{escapeFstabSpaces $ROOTFS}}/dev/null {{escapeFstabSpaces $ROOTFS}}/proc/sysrq-trigger none bind,ro 0 0 +# Restrict access to some stuff in /proc. Note that /proc is already mounted +# read-only, so we don't need to bother about things that are just dangerous +# to write to (like sysrq-trigger). Also, recent kernels won't let a container +# peek into /proc/kcore, but let's cater for people who might run Docker on +# older kernels. Just in case. lxc.mount.entry = {{escapeFstabSpaces $ROOTFS}}/dev/null {{escapeFstabSpaces $ROOTFS}}/proc/kcore none bind,ro 0 0 {{end}} diff --git a/daemon/execdriver/native/create.go b/daemon/execdriver/native/create.go index 00e6fc4b26..6f663f916e 100644 --- a/daemon/execdriver/native/create.go +++ b/daemon/execdriver/native/create.go @@ -84,8 +84,6 @@ func (d *driver) setPrivileged(container *libcontainer.Container) error { } container.Cgroups.DeviceAccess = true - // add sysfs as a mount for privileged containers - container.Mounts = append(container.Mounts, libcontainer.Mount{Type: "sysfs"}) delete(container.Context, "restriction_path") if apparmor.IsEnabled() { diff --git a/integration-cli/docker_cli_run_test.go b/integration-cli/docker_cli_run_test.go index 83867267ae..b9737feeea 100644 --- a/integration-cli/docker_cli_run_test.go +++ b/integration-cli/docker_cli_run_test.go @@ -725,24 +725,46 @@ func TestUnPrivilegedCannotMount(t *testing.T) { logDone("run - test un-privileged cannot mount") } -func TestSysNotAvaliableInNonPrivilegedContainers(t *testing.T) { - cmd := exec.Command(dockerBinary, "run", "busybox", "ls", "/sys/kernel") +func TestSysNotWritableInNonPrivilegedContainers(t *testing.T) { + cmd := exec.Command(dockerBinary, "run", "busybox", "touch", "/sys/kernel/profiling") if code, err := runCommand(cmd); err == nil || code == 0 { - t.Fatal("sys should not be available in a non privileged container") + t.Fatal("sys should not be writable in a non privileged container") } deleteAllContainers() - logDone("run - sys not avaliable in non privileged container") + logDone("run - sys not writable in non privileged container") } -func TestSysAvaliableInPrivilegedContainers(t *testing.T) { - cmd := exec.Command(dockerBinary, "run", "--privileged", "busybox", "ls", "/sys/kernel") +func TestSysWritableInPrivilegedContainers(t *testing.T) { + cmd := exec.Command(dockerBinary, "run", "--privileged", "busybox", "touch", "/sys/kernel/profiling") if code, err := runCommand(cmd); err != nil || code != 0 { - t.Fatalf("sys should be available in privileged container") + t.Fatalf("sys should be writable in privileged container") } deleteAllContainers() - logDone("run - sys avaliable in privileged container") + logDone("run - sys writable in privileged container") +} + +func TestProcNotWritableInNonPrivilegedContainers(t *testing.T) { + cmd := exec.Command(dockerBinary, "run", "busybox", "touch", "/proc/sysrq-trigger") + if code, err := runCommand(cmd); err == nil || code == 0 { + t.Fatal("proc should not be writable in a non privileged container") + } + + deleteAllContainers() + + logDone("run - proc not writable in non privileged container") +} + +func TestProcWritableInPrivilegedContainers(t *testing.T) { + cmd := exec.Command(dockerBinary, "run", "--privileged", "busybox", "touch", "/proc/sysrq-trigger") + if code, err := runCommand(cmd); err != nil || code != 0 { + t.Fatalf("proc should be writable in privileged container") + } + + deleteAllContainers() + + logDone("run - proc writable in privileged container") } diff --git a/pkg/libcontainer/mount/init.go b/pkg/libcontainer/mount/init.go index 735970cded..cc3ce2158e 100644 --- a/pkg/libcontainer/mount/init.go +++ b/pkg/libcontainer/mount/init.go @@ -11,7 +11,6 @@ import ( "github.com/dotcloud/docker/pkg/label" "github.com/dotcloud/docker/pkg/libcontainer" "github.com/dotcloud/docker/pkg/libcontainer/mount/nodes" - "github.com/dotcloud/docker/pkg/libcontainer/security/restrict" "github.com/dotcloud/docker/pkg/system" ) @@ -51,11 +50,6 @@ func InitializeMountNamespace(rootfs, console string, container *libcontainer.Co if err := nodes.CopyN(rootfs, nodes.DefaultNodes); err != nil { return fmt.Errorf("copy dev nodes %s", err) } - if restrictionPath := container.Context["restriction_path"]; restrictionPath != "" { - if err := restrict.Restrict(rootfs, restrictionPath); err != nil { - return fmt.Errorf("restrict %s", err) - } - } if err := SetupPtmx(rootfs, console, container.Context["mount_label"]); err != nil { return err } @@ -124,10 +118,11 @@ func setupBindmounts(rootfs string, bindMounts libcontainer.Mounts) error { } // TODO: this is crappy right now and should be cleaned up with a better way of handling system and -// standard bind mounts allowing them to be more dymanic +// standard bind mounts allowing them to be more dynamic func newSystemMounts(rootfs, mountLabel string, mounts libcontainer.Mounts) []mount { systemMounts := []mount{ {source: "proc", path: filepath.Join(rootfs, "proc"), device: "proc", flags: defaultMountFlags}, + {source: "sysfs", path: filepath.Join(rootfs, "sys"), device: "sysfs", flags: defaultMountFlags}, } if len(mounts.OfType("devtmpfs")) == 1 { @@ -138,8 +133,5 @@ func newSystemMounts(rootfs, mountLabel string, mounts libcontainer.Mounts) []mo mount{source: "devpts", path: filepath.Join(rootfs, "dev", "pts"), device: "devpts", flags: syscall.MS_NOSUID | syscall.MS_NOEXEC, data: label.FormatMountLabel("newinstance,ptmxmode=0666,mode=620,gid=5", mountLabel)}, ) - if len(mounts.OfType("sysfs")) == 1 { - systemMounts = append(systemMounts, mount{source: "sysfs", path: filepath.Join(rootfs, "sys"), device: "sysfs", flags: defaultMountFlags}) - } return systemMounts } diff --git a/pkg/libcontainer/nsinit/init.go b/pkg/libcontainer/nsinit/init.go index faec12af32..bafb877cd9 100644 --- a/pkg/libcontainer/nsinit/init.go +++ b/pkg/libcontainer/nsinit/init.go @@ -16,6 +16,7 @@ import ( "github.com/dotcloud/docker/pkg/libcontainer/mount" "github.com/dotcloud/docker/pkg/libcontainer/network" "github.com/dotcloud/docker/pkg/libcontainer/security/capabilities" + "github.com/dotcloud/docker/pkg/libcontainer/security/restrict" "github.com/dotcloud/docker/pkg/libcontainer/utils" "github.com/dotcloud/docker/pkg/system" "github.com/dotcloud/docker/pkg/user" @@ -68,18 +69,25 @@ func Init(container *libcontainer.Container, uncleanRootfs, consolePath string, if err := system.Sethostname(container.Hostname); err != nil { return fmt.Errorf("sethostname %s", err) } - if err := FinalizeNamespace(container); err != nil { - return fmt.Errorf("finalize namespace %s", err) - } runtime.LockOSThread() + if restrictionPath := container.Context["restriction_path"]; restrictionPath != "" { + if err := restrict.Restrict("/", restrictionPath); err != nil { + return err + } + } + if err := apparmor.ApplyProfile(os.Getpid(), container.Context["apparmor_profile"]); err != nil { return err } if err := label.SetProcessLabel(container.Context["process_label"]); err != nil { return fmt.Errorf("set process label %s", err) } + + if err := FinalizeNamespace(container); err != nil { + return fmt.Errorf("finalize namespace %s", err) + } return system.Execv(args[0], args[0:], container.Env) } diff --git a/pkg/libcontainer/security/restrict/restrict.go b/pkg/libcontainer/security/restrict/restrict.go index 291d6ca5dc..8c08ea1806 100644 --- a/pkg/libcontainer/security/restrict/restrict.go +++ b/pkg/libcontainer/security/restrict/restrict.go @@ -9,43 +9,67 @@ import ( "github.com/dotcloud/docker/pkg/system" ) -const flags = syscall.MS_BIND | syscall.MS_REC | syscall.MS_RDONLY - -var restrictions = map[string]string{ - // dirs - "/proc/sys": "", - "/proc/irq": "", - "/proc/acpi": "", - - // files - "/proc/sysrq-trigger": "/dev/null", - "/proc/kcore": "/dev/null", +// "restrictions" are container paths (files, directories, whatever) that have to be masked. +// maskPath is a "safe" path to be mounted over maskedPath. It can take two special values: +// - if it is "", then nothing is mounted; +// - if it is "EMPTY", then an empty directory is mounted instead. +// If remountRO is true then the maskedPath is remounted read-only (regardless of whether a maskPath was used). +type restriction struct { + maskedPath string + maskPath string + remountRO bool } -// Restrict locks down access to many areas of proc -// by using the asumption that the user does not have mount caps to -// revert the changes made here -func Restrict(rootfs, empty string) error { - for dest, source := range restrictions { - dest = filepath.Join(rootfs, dest) +var restrictions = []restriction{ + {"/proc", "", true}, + {"/sys", "", true}, + {"/proc/kcore", "/dev/null", false}, +} - // we don't have a "/dev/null" for dirs so have the requester pass a dir - // for us to bind mount - switch source { - case "": - source = empty - default: - source = filepath.Join(rootfs, source) - } - if err := system.Mount(source, dest, "bind", flags, ""); err != nil { - if os.IsNotExist(err) { - continue +// This has to be called while the container still has CAP_SYS_ADMIN (to be able to perform mounts). +// However, afterwards, CAP_SYS_ADMIN should be dropped (otherwise the user will be able to revert those changes). +// "empty" should be the path to an empty directory. +func Restrict(rootfs, empty string) error { + for _, restriction := range restrictions { + dest := filepath.Join(rootfs, restriction.maskedPath) + if restriction.maskPath != "" { + var source string + if restriction.maskPath == "EMPTY" { + source = empty + } else { + source = filepath.Join(rootfs, restriction.maskPath) + } + if err := system.Mount(source, dest, "", syscall.MS_BIND, ""); err != nil { + return fmt.Errorf("unable to bind-mount %s over %s: %s", source, dest, err) } - return fmt.Errorf("unable to mount %s over %s %s", source, dest, err) } - if err := system.Mount("", dest, "bind", flags|syscall.MS_REMOUNT, ""); err != nil { - return fmt.Errorf("unable to mount %s over %s %s", source, dest, err) + if restriction.remountRO { + if err := system.Mount("", dest, "", syscall.MS_REMOUNT|syscall.MS_RDONLY, ""); err != nil { + return fmt.Errorf("unable to remount %s readonly: %s", dest, err) + } } } + + // This weird trick will allow us to mount /proc read-only, while being able to use AppArmor. + // This is because apparently, loading an AppArmor profile requires write access to /proc/1/attr. + // So we do another mount of procfs, ensure it's write-able, and bind-mount a subset of it. + tmpProcPath := filepath.Join(rootfs, ".proc") + if err := os.Mkdir(tmpProcPath, 0700); err != nil { + return fmt.Errorf("unable to create temporary proc mountpoint %s: %s", tmpProcPath, err) + } + if err := system.Mount("proc", tmpProcPath, "proc", 0, ""); err != nil { + return fmt.Errorf("unable to mount proc on temporary proc mountpoint: %s", err) + } + if err := system.Mount("proc", tmpProcPath, "", syscall.MS_REMOUNT, ""); err != nil { + return fmt.Errorf("unable to remount proc read-write: %s", err) + } + rwAttrPath := filepath.Join(rootfs, ".proc", "1", "attr") + roAttrPath := filepath.Join(rootfs, "proc", "1", "attr") + if err := system.Mount(rwAttrPath, roAttrPath, "", syscall.MS_BIND, ""); err != nil { + return fmt.Errorf("unable to bind-mount %s on %s: %s", rwAttrPath, roAttrPath, err) + } + if err := system.Unmount(tmpProcPath, 0); err != nil { + return fmt.Errorf("unable to unmount temporary proc filesystem: %s", err) + } return nil } From 83982e8b1d0cd825e1762b5540db8ae77c34f065 Mon Sep 17 00:00:00 2001 From: Michael Crosby Date: Wed, 30 Apr 2014 19:09:25 -0700 Subject: [PATCH 2/6] Update to enable cross compile Docker-DCO-1.1-Signed-off-by: Michael Crosby (github: crosbymichael) --- pkg/libcontainer/nsinit/init.go | 1 - pkg/libcontainer/security/restrict/restrict.go | 2 ++ pkg/libcontainer/security/restrict/unsupported.go | 9 +++++++++ 3 files changed, 11 insertions(+), 1 deletion(-) create mode 100644 pkg/libcontainer/security/restrict/unsupported.go diff --git a/pkg/libcontainer/nsinit/init.go b/pkg/libcontainer/nsinit/init.go index bafb877cd9..90b97a9f99 100644 --- a/pkg/libcontainer/nsinit/init.go +++ b/pkg/libcontainer/nsinit/init.go @@ -84,7 +84,6 @@ func Init(container *libcontainer.Container, uncleanRootfs, consolePath string, if err := label.SetProcessLabel(container.Context["process_label"]); err != nil { return fmt.Errorf("set process label %s", err) } - if err := FinalizeNamespace(container); err != nil { return fmt.Errorf("finalize namespace %s", err) } diff --git a/pkg/libcontainer/security/restrict/restrict.go b/pkg/libcontainer/security/restrict/restrict.go index 8c08ea1806..a9bdc4bacb 100644 --- a/pkg/libcontainer/security/restrict/restrict.go +++ b/pkg/libcontainer/security/restrict/restrict.go @@ -1,3 +1,5 @@ +// +build linux + package restrict import ( diff --git a/pkg/libcontainer/security/restrict/unsupported.go b/pkg/libcontainer/security/restrict/unsupported.go new file mode 100644 index 0000000000..6898baab3d --- /dev/null +++ b/pkg/libcontainer/security/restrict/unsupported.go @@ -0,0 +1,9 @@ +// +build !linux + +package restrict + +import "fmt" + +func Restrict(rootfs, empty string) error { + return fmt.Errorf("not supported") +} From f5139233b930e436707a65cc032aa2952edd6e4a Mon Sep 17 00:00:00 2001 From: Michael Crosby Date: Thu, 1 May 2014 10:08:18 -0700 Subject: [PATCH 3/6] Update restrictions for better handling of mounts This also cleans up some of the left over restriction paths code from before. Docker-DCO-1.1-Signed-off-by: Michael Crosby (github: crosbymichael) --- daemon/execdriver/lxc/driver.go | 60 +++++++---------- daemon/execdriver/lxc/lxc_template.go | 12 +--- daemon/execdriver/native/create.go | 4 +- daemon/execdriver/native/driver.go | 7 -- pkg/libcontainer/mount/init.go | 7 +- pkg/libcontainer/nsinit/init.go | 4 +- .../security/restrict/restrict.go | 65 ++++++------------- .../security/restrict/unsupported.go | 2 +- 8 files changed, 54 insertions(+), 107 deletions(-) diff --git a/daemon/execdriver/lxc/driver.go b/daemon/execdriver/lxc/driver.go index 3fe44202ac..92a79ff5a5 100644 --- a/daemon/execdriver/lxc/driver.go +++ b/daemon/execdriver/lxc/driver.go @@ -2,12 +2,6 @@ package lxc import ( "fmt" - "github.com/dotcloud/docker/daemon/execdriver" - "github.com/dotcloud/docker/pkg/cgroups" - "github.com/dotcloud/docker/pkg/label" - "github.com/dotcloud/docker/pkg/libcontainer/security/restrict" - "github.com/dotcloud/docker/pkg/system" - "github.com/dotcloud/docker/utils" "io/ioutil" "log" "os" @@ -18,6 +12,13 @@ import ( "strings" "syscall" "time" + + "github.com/dotcloud/docker/daemon/execdriver" + "github.com/dotcloud/docker/pkg/cgroups" + "github.com/dotcloud/docker/pkg/label" + "github.com/dotcloud/docker/pkg/libcontainer/security/restrict" + "github.com/dotcloud/docker/pkg/system" + "github.com/dotcloud/docker/utils" ) const DriverName = "lxc" @@ -27,31 +28,26 @@ func init() { if err := setupEnv(args); err != nil { return err } - if err := setupHostname(args); err != nil { return err } - if err := setupNetworking(args); err != nil { return err } - - if err := restrict.Restrict("/", "/empty"); err != nil { - return err + if !args.Privileged { + if err := restrict.Restrict(); err != nil { + return err + } } - if err := setupCapabilities(args); err != nil { return err } - if err := setupWorkingDirectory(args); err != nil { return err } - if err := system.CloseFdsFrom(3); err != nil { return err } - if err := changeUser(args); err != nil { return err } @@ -69,10 +65,9 @@ func init() { } type driver struct { - root string // root path for the driver to use - apparmor bool - sharedRoot bool - restrictionPath string + root string // root path for the driver to use + apparmor bool + sharedRoot bool } func NewDriver(root string, apparmor bool) (*driver, error) { @@ -80,15 +75,10 @@ func NewDriver(root string, apparmor bool) (*driver, error) { if err := linkLxcStart(root); err != nil { return nil, err } - restrictionPath := filepath.Join(root, "empty") - if err := os.MkdirAll(restrictionPath, 0700); err != nil { - return nil, err - } return &driver{ - apparmor: apparmor, - root: root, - sharedRoot: rootIsShared(), - restrictionPath: restrictionPath, + apparmor: apparmor, + root: root, + sharedRoot: rootIsShared(), }, nil } @@ -419,16 +409,14 @@ func (d *driver) generateLXCConfig(c *execdriver.Command) (string, error) { if err := LxcTemplateCompiled.Execute(fo, struct { *execdriver.Command - AppArmor bool - ProcessLabel string - MountLabel string - RestrictionSource string + AppArmor bool + ProcessLabel string + MountLabel string }{ - Command: c, - AppArmor: d.apparmor, - ProcessLabel: process, - MountLabel: mount, - RestrictionSource: d.restrictionPath, + Command: c, + AppArmor: d.apparmor, + ProcessLabel: process, + MountLabel: mount, }); err != nil { return "", err } diff --git a/daemon/execdriver/lxc/lxc_template.go b/daemon/execdriver/lxc/lxc_template.go index 03d32e72b5..19fa43c4c2 100644 --- a/daemon/execdriver/lxc/lxc_template.go +++ b/daemon/execdriver/lxc/lxc_template.go @@ -1,10 +1,11 @@ package lxc import ( - "github.com/dotcloud/docker/daemon/execdriver" - "github.com/dotcloud/docker/pkg/label" "strings" "text/template" + + "github.com/dotcloud/docker/daemon/execdriver" + "github.com/dotcloud/docker/pkg/label" ) const LxcTemplate = ` @@ -110,13 +111,6 @@ lxc.aa_profile = unconfined {{else}} # Let AppArmor normal confinement take place (i.e., not unconfined) {{end}} -{{else}} -# Restrict access to some stuff in /proc. Note that /proc is already mounted -# read-only, so we don't need to bother about things that are just dangerous -# to write to (like sysrq-trigger). Also, recent kernels won't let a container -# peek into /proc/kcore, but let's cater for people who might run Docker on -# older kernels. Just in case. -lxc.mount.entry = {{escapeFstabSpaces $ROOTFS}}/dev/null {{escapeFstabSpaces $ROOTFS}}/proc/kcore none bind,ro 0 0 {{end}} # limits diff --git a/daemon/execdriver/native/create.go b/daemon/execdriver/native/create.go index 6f663f916e..5562d08986 100644 --- a/daemon/execdriver/native/create.go +++ b/daemon/execdriver/native/create.go @@ -24,7 +24,7 @@ func (d *driver) createContainer(c *execdriver.Command) (*libcontainer.Container container.Cgroups.Name = c.ID // check to see if we are running in ramdisk to disable pivot root container.NoPivotRoot = os.Getenv("DOCKER_RAMDISK") != "" - container.Context["restriction_path"] = d.restrictionPath + container.Context["restrictions"] = "true" if err := d.createNetwork(container, c); err != nil { return nil, err @@ -84,7 +84,7 @@ func (d *driver) setPrivileged(container *libcontainer.Container) error { } container.Cgroups.DeviceAccess = true - delete(container.Context, "restriction_path") + delete(container.Context, "restrictions") if apparmor.IsEnabled() { container.Context["apparmor_profile"] = "unconfined" diff --git a/daemon/execdriver/native/driver.go b/daemon/execdriver/native/driver.go index a397387f11..e674d57333 100644 --- a/daemon/execdriver/native/driver.go +++ b/daemon/execdriver/native/driver.go @@ -57,7 +57,6 @@ type driver struct { root string initPath string activeContainers map[string]*exec.Cmd - restrictionPath string } func NewDriver(root, initPath string) (*driver, error) { @@ -68,14 +67,8 @@ func NewDriver(root, initPath string) (*driver, error) { if err := apparmor.InstallDefaultProfile(filepath.Join(root, "../..", BackupApparmorProfilePath)); err != nil { return nil, err } - restrictionPath := filepath.Join(root, "empty") - if err := os.MkdirAll(restrictionPath, 0700); err != nil { - return nil, err - } - return &driver{ root: root, - restrictionPath: restrictionPath, initPath: initPath, activeContainers: make(map[string]*exec.Cmd), }, nil diff --git a/pkg/libcontainer/mount/init.go b/pkg/libcontainer/mount/init.go index cc3ce2158e..6a54f2444e 100644 --- a/pkg/libcontainer/mount/init.go +++ b/pkg/libcontainer/mount/init.go @@ -123,15 +123,12 @@ func newSystemMounts(rootfs, mountLabel string, mounts libcontainer.Mounts) []mo systemMounts := []mount{ {source: "proc", path: filepath.Join(rootfs, "proc"), device: "proc", flags: defaultMountFlags}, {source: "sysfs", path: filepath.Join(rootfs, "sys"), device: "sysfs", flags: defaultMountFlags}, + {source: "shm", path: filepath.Join(rootfs, "dev", "shm"), device: "tmpfs", flags: defaultMountFlags, data: label.FormatMountLabel("mode=1777,size=65536k", mountLabel)}, + {source: "devpts", path: filepath.Join(rootfs, "dev", "pts"), device: "devpts", flags: syscall.MS_NOSUID | syscall.MS_NOEXEC, data: label.FormatMountLabel("newinstance,ptmxmode=0666,mode=620,gid=5", mountLabel)}, } if len(mounts.OfType("devtmpfs")) == 1 { systemMounts = append(systemMounts, mount{source: "tmpfs", path: filepath.Join(rootfs, "dev"), device: "tmpfs", flags: syscall.MS_NOSUID | syscall.MS_STRICTATIME, data: label.FormatMountLabel("mode=755", mountLabel)}) } - systemMounts = append(systemMounts, - mount{source: "shm", path: filepath.Join(rootfs, "dev", "shm"), device: "tmpfs", flags: defaultMountFlags, data: label.FormatMountLabel("mode=1777,size=65536k", mountLabel)}, - mount{source: "devpts", path: filepath.Join(rootfs, "dev", "pts"), device: "devpts", flags: syscall.MS_NOSUID | syscall.MS_NOEXEC, data: label.FormatMountLabel("newinstance,ptmxmode=0666,mode=620,gid=5", mountLabel)}, - ) - return systemMounts } diff --git a/pkg/libcontainer/nsinit/init.go b/pkg/libcontainer/nsinit/init.go index 90b97a9f99..755847948e 100644 --- a/pkg/libcontainer/nsinit/init.go +++ b/pkg/libcontainer/nsinit/init.go @@ -72,8 +72,8 @@ func Init(container *libcontainer.Container, uncleanRootfs, consolePath string, runtime.LockOSThread() - if restrictionPath := container.Context["restriction_path"]; restrictionPath != "" { - if err := restrict.Restrict("/", restrictionPath); err != nil { + if container.Context["restrictions"] != "" { + if err := restrict.Restrict(); err != nil { return err } } diff --git a/pkg/libcontainer/security/restrict/restrict.go b/pkg/libcontainer/security/restrict/restrict.go index a9bdc4bacb..2b7cea5a48 100644 --- a/pkg/libcontainer/security/restrict/restrict.go +++ b/pkg/libcontainer/security/restrict/restrict.go @@ -11,67 +11,42 @@ import ( "github.com/dotcloud/docker/pkg/system" ) -// "restrictions" are container paths (files, directories, whatever) that have to be masked. -// maskPath is a "safe" path to be mounted over maskedPath. It can take two special values: -// - if it is "", then nothing is mounted; -// - if it is "EMPTY", then an empty directory is mounted instead. -// If remountRO is true then the maskedPath is remounted read-only (regardless of whether a maskPath was used). -type restriction struct { - maskedPath string - maskPath string - remountRO bool -} - -var restrictions = []restriction{ - {"/proc", "", true}, - {"/sys", "", true}, - {"/proc/kcore", "/dev/null", false}, -} - // This has to be called while the container still has CAP_SYS_ADMIN (to be able to perform mounts). // However, afterwards, CAP_SYS_ADMIN should be dropped (otherwise the user will be able to revert those changes). -// "empty" should be the path to an empty directory. -func Restrict(rootfs, empty string) error { - for _, restriction := range restrictions { - dest := filepath.Join(rootfs, restriction.maskedPath) - if restriction.maskPath != "" { - var source string - if restriction.maskPath == "EMPTY" { - source = empty - } else { - source = filepath.Join(rootfs, restriction.maskPath) - } - if err := system.Mount(source, dest, "", syscall.MS_BIND, ""); err != nil { - return fmt.Errorf("unable to bind-mount %s over %s: %s", source, dest, err) - } - } - if restriction.remountRO { - if err := system.Mount("", dest, "", syscall.MS_REMOUNT|syscall.MS_RDONLY, ""); err != nil { - return fmt.Errorf("unable to remount %s readonly: %s", dest, err) - } +func Restrict() error { + // remount proc and sys as readonly + for _, dest := range []string{"proc", "sys"} { + if err := system.Mount("", dest, "", syscall.MS_REMOUNT|syscall.MS_RDONLY, ""); err != nil { + return fmt.Errorf("unable to remount %s readonly: %s", dest, err) } } + if err := system.Mount("/proc/kcore", "/dev/null", "", syscall.MS_BIND, ""); err != nil { + return fmt.Errorf("unable to bind-mount /dev/null over /proc/kcore") + } + // This weird trick will allow us to mount /proc read-only, while being able to use AppArmor. // This is because apparently, loading an AppArmor profile requires write access to /proc/1/attr. // So we do another mount of procfs, ensure it's write-able, and bind-mount a subset of it. - tmpProcPath := filepath.Join(rootfs, ".proc") - if err := os.Mkdir(tmpProcPath, 0700); err != nil { - return fmt.Errorf("unable to create temporary proc mountpoint %s: %s", tmpProcPath, err) + var ( + rwAttrPath = filepath.Join(".proc", "1", "attr") + roAttrPath = filepath.Join("proc", "1", "attr") + ) + + if err := os.Mkdir(".proc", 0700); err != nil { + return fmt.Errorf("unable to create temporary proc mountpoint .proc: %s", err) } - if err := system.Mount("proc", tmpProcPath, "proc", 0, ""); err != nil { + if err := system.Mount("proc", ".proc", "proc", 0, ""); err != nil { return fmt.Errorf("unable to mount proc on temporary proc mountpoint: %s", err) } - if err := system.Mount("proc", tmpProcPath, "", syscall.MS_REMOUNT, ""); err != nil { + if err := system.Mount("proc", ".proc", "", syscall.MS_REMOUNT, ""); err != nil { return fmt.Errorf("unable to remount proc read-write: %s", err) } - rwAttrPath := filepath.Join(rootfs, ".proc", "1", "attr") - roAttrPath := filepath.Join(rootfs, "proc", "1", "attr") if err := system.Mount(rwAttrPath, roAttrPath, "", syscall.MS_BIND, ""); err != nil { return fmt.Errorf("unable to bind-mount %s on %s: %s", rwAttrPath, roAttrPath, err) } - if err := system.Unmount(tmpProcPath, 0); err != nil { + if err := system.Unmount(".proc", 0); err != nil { return fmt.Errorf("unable to unmount temporary proc filesystem: %s", err) } - return nil + return os.RemoveAll(".proc") } diff --git a/pkg/libcontainer/security/restrict/unsupported.go b/pkg/libcontainer/security/restrict/unsupported.go index 6898baab3d..464e8d498d 100644 --- a/pkg/libcontainer/security/restrict/unsupported.go +++ b/pkg/libcontainer/security/restrict/unsupported.go @@ -4,6 +4,6 @@ package restrict import "fmt" -func Restrict(rootfs, empty string) error { +func Restrict() error { return fmt.Errorf("not supported") } From 3f74bdd93f08b3001f11a137210ee67a6d23c084 Mon Sep 17 00:00:00 2001 From: Michael Crosby Date: Thu, 1 May 2014 11:11:29 -0700 Subject: [PATCH 4/6] Mount attr and task as rw for selinux support Docker-DCO-1.1-Signed-off-by: Michael Crosby (github: crosbymichael) --- pkg/libcontainer/security/restrict/restrict.go | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/pkg/libcontainer/security/restrict/restrict.go b/pkg/libcontainer/security/restrict/restrict.go index 2b7cea5a48..74de70aa6a 100644 --- a/pkg/libcontainer/security/restrict/restrict.go +++ b/pkg/libcontainer/security/restrict/restrict.go @@ -28,11 +28,6 @@ func Restrict() error { // This weird trick will allow us to mount /proc read-only, while being able to use AppArmor. // This is because apparently, loading an AppArmor profile requires write access to /proc/1/attr. // So we do another mount of procfs, ensure it's write-able, and bind-mount a subset of it. - var ( - rwAttrPath = filepath.Join(".proc", "1", "attr") - roAttrPath = filepath.Join("proc", "1", "attr") - ) - if err := os.Mkdir(".proc", 0700); err != nil { return fmt.Errorf("unable to create temporary proc mountpoint .proc: %s", err) } @@ -42,8 +37,10 @@ func Restrict() error { if err := system.Mount("proc", ".proc", "", syscall.MS_REMOUNT, ""); err != nil { return fmt.Errorf("unable to remount proc read-write: %s", err) } - if err := system.Mount(rwAttrPath, roAttrPath, "", syscall.MS_BIND, ""); err != nil { - return fmt.Errorf("unable to bind-mount %s on %s: %s", rwAttrPath, roAttrPath, err) + for _, path := range []string{"attr", "task"} { + if err := system.Mount(filepath.Join(".proc", "1", path), filepath.Join("proc", "1", path), "", syscall.MS_BIND, ""); err != nil { + return fmt.Errorf("unable to bind-mount %s: %s", path, err) + } } if err := system.Unmount(".proc", 0); err != nil { return fmt.Errorf("unable to unmount temporary proc filesystem: %s", err) From 24e0df8136c238cb3e231b939a82058950e6eb02 Mon Sep 17 00:00:00 2001 From: Michael Crosby Date: Thu, 1 May 2014 13:55:23 -0700 Subject: [PATCH 5/6] Fix /proc/kcore mount of /dev/null Docker-DCO-1.1-Signed-off-by: Michael Crosby (github: crosbymichael) --- pkg/libcontainer/security/restrict/restrict.go | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pkg/libcontainer/security/restrict/restrict.go b/pkg/libcontainer/security/restrict/restrict.go index 74de70aa6a..411bc06807 100644 --- a/pkg/libcontainer/security/restrict/restrict.go +++ b/pkg/libcontainer/security/restrict/restrict.go @@ -20,8 +20,7 @@ func Restrict() error { return fmt.Errorf("unable to remount %s readonly: %s", dest, err) } } - - if err := system.Mount("/proc/kcore", "/dev/null", "", syscall.MS_BIND, ""); err != nil { + if err := system.Mount("/dev/null", "/proc/kcore", "", syscall.MS_BIND, ""); err != nil { return fmt.Errorf("unable to bind-mount /dev/null over /proc/kcore") } From 76fa7d588adfe644824d9a00dafce2d2991a7013 Mon Sep 17 00:00:00 2001 From: Michael Crosby Date: Thu, 1 May 2014 19:09:12 -0700 Subject: [PATCH 6/6] Apply apparmor before restrictions There is not need for the remount hack, we use aa_change_onexec so the apparmor profile is not applied until we exec the users app. Docker-DCO-1.1-Signed-off-by: Michael Crosby (github: crosbymichael) --- pkg/apparmor/apparmor.go | 2 +- pkg/apparmor/apparmor_disabled.go | 4 +-- pkg/libcontainer/console/console.go | 5 ++-- pkg/libcontainer/nsinit/init.go | 13 +++++----- .../security/restrict/restrict.go | 25 +------------------ 5 files changed, 12 insertions(+), 37 deletions(-) diff --git a/pkg/apparmor/apparmor.go b/pkg/apparmor/apparmor.go index 6fdb1f8958..704ee29ed0 100644 --- a/pkg/apparmor/apparmor.go +++ b/pkg/apparmor/apparmor.go @@ -20,7 +20,7 @@ func IsEnabled() bool { return false } -func ApplyProfile(pid int, name string) error { +func ApplyProfile(name string) error { if name == "" { return nil } diff --git a/pkg/apparmor/apparmor_disabled.go b/pkg/apparmor/apparmor_disabled.go index 77543e4a87..8d86ce9d4a 100644 --- a/pkg/apparmor/apparmor_disabled.go +++ b/pkg/apparmor/apparmor_disabled.go @@ -2,12 +2,10 @@ package apparmor -import () - func IsEnabled() bool { return false } -func ApplyProfile(pid int, name string) error { +func ApplyProfile(name string) error { return nil } diff --git a/pkg/libcontainer/console/console.go b/pkg/libcontainer/console/console.go index 05cd08a92e..5f06aea225 100644 --- a/pkg/libcontainer/console/console.go +++ b/pkg/libcontainer/console/console.go @@ -4,11 +4,12 @@ package console import ( "fmt" - "github.com/dotcloud/docker/pkg/label" - "github.com/dotcloud/docker/pkg/system" "os" "path/filepath" "syscall" + + "github.com/dotcloud/docker/pkg/label" + "github.com/dotcloud/docker/pkg/system" ) // Setup initializes the proper /dev/console inside the rootfs path diff --git a/pkg/libcontainer/nsinit/init.go b/pkg/libcontainer/nsinit/init.go index 755847948e..22345f603f 100644 --- a/pkg/libcontainer/nsinit/init.go +++ b/pkg/libcontainer/nsinit/init.go @@ -72,18 +72,17 @@ func Init(container *libcontainer.Container, uncleanRootfs, consolePath string, runtime.LockOSThread() + if err := apparmor.ApplyProfile(container.Context["apparmor_profile"]); err != nil { + return fmt.Errorf("set apparmor profile %s: %s", container.Context["apparmor_profile"], err) + } + if err := label.SetProcessLabel(container.Context["process_label"]); err != nil { + return fmt.Errorf("set process label %s", err) + } if container.Context["restrictions"] != "" { if err := restrict.Restrict(); err != nil { return err } } - - if err := apparmor.ApplyProfile(os.Getpid(), container.Context["apparmor_profile"]); err != nil { - return err - } - if err := label.SetProcessLabel(container.Context["process_label"]); err != nil { - return fmt.Errorf("set process label %s", err) - } if err := FinalizeNamespace(container); err != nil { return fmt.Errorf("finalize namespace %s", err) } diff --git a/pkg/libcontainer/security/restrict/restrict.go b/pkg/libcontainer/security/restrict/restrict.go index 411bc06807..cfff09f512 100644 --- a/pkg/libcontainer/security/restrict/restrict.go +++ b/pkg/libcontainer/security/restrict/restrict.go @@ -4,8 +4,6 @@ package restrict import ( "fmt" - "os" - "path/filepath" "syscall" "github.com/dotcloud/docker/pkg/system" @@ -23,26 +21,5 @@ func Restrict() error { if err := system.Mount("/dev/null", "/proc/kcore", "", syscall.MS_BIND, ""); err != nil { return fmt.Errorf("unable to bind-mount /dev/null over /proc/kcore") } - - // This weird trick will allow us to mount /proc read-only, while being able to use AppArmor. - // This is because apparently, loading an AppArmor profile requires write access to /proc/1/attr. - // So we do another mount of procfs, ensure it's write-able, and bind-mount a subset of it. - if err := os.Mkdir(".proc", 0700); err != nil { - return fmt.Errorf("unable to create temporary proc mountpoint .proc: %s", err) - } - if err := system.Mount("proc", ".proc", "proc", 0, ""); err != nil { - return fmt.Errorf("unable to mount proc on temporary proc mountpoint: %s", err) - } - if err := system.Mount("proc", ".proc", "", syscall.MS_REMOUNT, ""); err != nil { - return fmt.Errorf("unable to remount proc read-write: %s", err) - } - for _, path := range []string{"attr", "task"} { - if err := system.Mount(filepath.Join(".proc", "1", path), filepath.Join("proc", "1", path), "", syscall.MS_BIND, ""); err != nil { - return fmt.Errorf("unable to bind-mount %s: %s", path, err) - } - } - if err := system.Unmount(".proc", 0); err != nil { - return fmt.Errorf("unable to unmount temporary proc filesystem: %s", err) - } - return os.RemoveAll(".proc") + return nil }