Sign Up
Log In
Log In
or
Sign Up
Places
All Projects
Status Monitor
Collapse sidebar
openSUSE:Step:FrontRunner
docker-runc
bsc1168481-0001-cgroup-devices-major-cleanups-a...
Overview
Repositories
Revisions
Requests
Users
Attributes
Meta
File bsc1168481-0001-cgroup-devices-major-cleanups-and-minimal-transition.patch of Package docker-runc
From 607e6ef7bf4f32652f5346e72b712b92ace98f58 Mon Sep 17 00:00:00 2001 From: Aleksa Sarai <asarai@suse.de> Date: Sun, 10 May 2020 02:40:25 +1000 Subject: [PATCH] cgroup: devices: major cleanups and minimal transition rules This is a backport of [1], which resolves many issues with the implementation of the runc devices cgroup code. Among other things, it removes some of the disruptive aspects of "runc update" which customers have run into. See [1] for more details. Rather than reproducing all of the commits, this is a squashed version of the following upstream commits: * 19a3e9bf8053 ("contrib: recvtty: add --no-stdin flag") * 11736767d2c5 ("cgroups: add GetFreezerState() helper to Manager") * 4d363ccbd9cc ("cgroup: devices: eradicate the Allow/Deny lists") * 5baa34c9134b ("specconv: remove default /dev/console access") * 8ed0c4c03641 ("configs: use different types for .Devices and .Resources.Devices") * 2ef5f417d923 ("cgroups: implement a devices cgroupv1 emulator") * b70aacc28f43 ("cgroupv1: devices: use minimal transition rules with devices.Emulator") * 410aef3ef858 ("cgroups: systemd: make use of Device*= properties") [1]: https://github.com/opencontainers/runc/pull/2391 SUSE-Bugs: bsc#1168481 Signed-off-by: Aleksa Sarai <asarai@suse.de> --- contrib/cmd/recvtty/recvtty.go | 19 +- libcontainer/README.md | 5 +- libcontainer/cgroups/cgroups.go | 3 + .../cgroups/devices/devices_emulator.go | 355 ++++++ .../cgroups/devices/devices_emulator_test.go | 1096 +++++++++++++++++ .../cgroups/ebpf/devicefilter/devicefilter.go | 4 +- .../ebpf/devicefilter/devicefilter_test.go | 70 +- libcontainer/cgroups/fs/apply_raw.go | 39 +- libcontainer/cgroups/fs/devices.go | 87 +- libcontainer/cgroups/fs/devices_test.go | 91 +- libcontainer/cgroups/fs/freezer.go | 34 +- libcontainer/cgroups/fs2/devices.go | 39 +- libcontainer/cgroups/fs2/freezer.go | 62 +- libcontainer/cgroups/fs2/fs2.go | 4 + libcontainer/cgroups/systemd/apply_systemd.go | 229 +++- .../cgroups/systemd/unified_hierarchy.go | 14 + libcontainer/configs/cgroup_linux.go | 11 +- libcontainer/configs/device.go | 169 ++- libcontainer/configs/device_defaults.go | 111 -- libcontainer/container_linux.go | 22 +- libcontainer/container_linux_test.go | 5 + libcontainer/devices/devices.go | 36 +- libcontainer/integration/template_test.go | 11 +- libcontainer/rootfs_linux.go | 19 +- libcontainer/specconv/spec_linux.go | 556 +++++---- 25 files changed, 2416 insertions(+), 675 deletions(-) create mode 100644 libcontainer/cgroups/devices/devices_emulator.go create mode 100644 libcontainer/cgroups/devices/devices_emulator_test.go delete mode 100644 libcontainer/configs/device_defaults.go diff --git a/contrib/cmd/recvtty/recvtty.go b/contrib/cmd/recvtty/recvtty.go index a658b8d20202..00b30e1c39cd 100644 --- a/contrib/cmd/recvtty/recvtty.go +++ b/contrib/cmd/recvtty/recvtty.go @@ -65,7 +65,7 @@ func bail(err error) { os.Exit(1) } -func handleSingle(path string) error { +func handleSingle(path string, noStdin bool) error { // Open a socket. ln, err := net.Listen("unix", path) if err != nil { @@ -113,10 +113,12 @@ func handleSingle(path string) error { io.Copy(os.Stdout, c) quitChan <- struct{}{} }() - go func() { - io.Copy(c, os.Stdin) - quitChan <- struct{}{} - }() + if !noStdin { + go func() { + io.Copy(c, os.Stdin) + quitChan <- struct{}{} + }() + } // Only close the master fd once we've stopped copying. <-quitChan @@ -201,6 +203,10 @@ func main() { Value: "", Usage: "Path to write daemon process ID to", }, + cli.BoolFlag{ + Name: "no-stdin", + Usage: "Disable stdin handling (no-op for null mode)", + }, } app.Action = func(ctx *cli.Context) error { @@ -218,9 +224,10 @@ func main() { } } + noStdin := ctx.Bool("no-stdin") switch ctx.String("mode") { case "single": - if err := handleSingle(path); err != nil { + if err := handleSingle(path, noStdin); err != nil { return err } case "null": diff --git a/libcontainer/README.md b/libcontainer/README.md index a791ca2d2494..6803ef56c5ba 100644 --- a/libcontainer/README.md +++ b/libcontainer/README.md @@ -155,8 +155,7 @@ config := &configs.Config{ Parent: "system", Resources: &configs.Resources{ MemorySwappiness: nil, - AllowAllDevices: nil, - AllowedDevices: configs.DefaultAllowedDevices, + Devices: specconv.AllowedDevices, }, }, MaskPaths: []string{ @@ -166,7 +165,7 @@ config := &configs.Config{ ReadonlyPaths: []string{ "/proc/sys", "/proc/sysrq-trigger", "/proc/irq", "/proc/bus", }, - Devices: configs.DefaultAutoCreatedDevices, + Devices: specconv.AllowedDevices, Hostname: "testing", Mounts: []*configs.Mount{ { diff --git a/libcontainer/cgroups/cgroups.go b/libcontainer/cgroups/cgroups.go index c0a965923f9b..b4bc05c15f96 100644 --- a/libcontainer/cgroups/cgroups.go +++ b/libcontainer/cgroups/cgroups.go @@ -49,6 +49,9 @@ type Manager interface { // Gets the cgroup as configured. GetCgroups() (*configs.Cgroup, error) + + // GetFreezerState retrieves the current FreezerState of the cgroup. + GetFreezerState() (configs.FreezerState, error) } type NotFoundError struct { diff --git a/libcontainer/cgroups/devices/devices_emulator.go b/libcontainer/cgroups/devices/devices_emulator.go new file mode 100644 index 000000000000..cfb535bd4d8b --- /dev/null +++ b/libcontainer/cgroups/devices/devices_emulator.go @@ -0,0 +1,355 @@ +// +build linux + +package devices + +import ( + "bufio" + "io" + "regexp" + "sort" + "strconv" + + "github.com/opencontainers/runc/libcontainer/configs" + + "github.com/pkg/errors" +) + +// deviceMeta is a DeviceRule without the Allow or Permissions fields, and no +// wildcard-type support. It's effectively the "match" portion of a metadata +// rule, for the purposes of our emulation. +type deviceMeta struct { + node configs.DeviceType + major int64 + minor int64 +} + +// deviceRule is effectively the tuple (deviceMeta, DevicePermissions). +type deviceRule struct { + meta deviceMeta + perms configs.DevicePermissions +} + +// deviceRules is a mapping of device metadata rules to the associated +// permissions in the ruleset. +type deviceRules map[deviceMeta]configs.DevicePermissions + +func (r deviceRules) orderedEntries() []deviceRule { + var rules []deviceRule + for meta, perms := range r { + rules = append(rules, deviceRule{meta: meta, perms: perms}) + } + sort.Slice(rules, func(i, j int) bool { + // Sort by (major, minor, type). + a, b := rules[i].meta, rules[j].meta + return a.major < b.major || + (a.major == b.major && a.minor < b.minor) || + (a.major == b.major && a.minor == b.minor && a.node < b.node) + }) + return rules +} + +type Emulator struct { + defaultAllow bool + rules deviceRules +} + +func (e *Emulator) IsBlacklist() bool { + return e.defaultAllow +} + +func (e *Emulator) IsAllowAll() bool { + return e.IsBlacklist() && len(e.rules) == 0 +} + +var devicesListRegexp = regexp.MustCompile(`^([abc])\s+(\d+|\*):(\d+|\*)\s+([rwm]+)$`) + +func parseLine(line string) (*deviceRule, error) { + matches := devicesListRegexp.FindStringSubmatch(line) + if matches == nil { + return nil, errors.Errorf("line doesn't match devices.list format") + } + var ( + rule deviceRule + node = matches[1] + major = matches[2] + minor = matches[3] + perms = matches[4] + ) + + // Parse the node type. + switch node { + case "a": + // Super-special case -- "a" always means every device with every + // access mode. In fact, for devices.list this actually indicates that + // the cgroup is in black-list mode. + // TODO: Double-check that the entire file is "a *:* rwm". + return nil, nil + case "b": + rule.meta.node = configs.BlockDevice + case "c": + rule.meta.node = configs.CharDevice + default: + // Should never happen! + return nil, errors.Errorf("unknown device type %q", node) + } + + // Parse the major number. + if major == "*" { + rule.meta.major = configs.Wildcard + } else { + val, err := strconv.ParseUint(major, 10, 32) + if err != nil { + return nil, errors.Wrap(err, "parse major number") + } + rule.meta.major = int64(val) + } + + // Parse the minor number. + if minor == "*" { + rule.meta.minor = configs.Wildcard + } else { + val, err := strconv.ParseUint(minor, 10, 32) + if err != nil { + return nil, errors.Wrap(err, "parse minor number") + } + rule.meta.minor = int64(val) + } + + // Parse the access permissions. + rule.perms = configs.DevicePermissions(perms) + if !rule.perms.IsValid() || rule.perms.IsEmpty() { + // Should never happen! + return nil, errors.Errorf("parse access mode: contained unknown modes or is empty: %q", perms) + } + return &rule, nil +} + +func (e *Emulator) addRule(rule deviceRule) error { + if e.rules == nil { + e.rules = make(map[deviceMeta]configs.DevicePermissions) + } + + // Merge with any pre-existing permissions. + oldPerms := e.rules[rule.meta] + newPerms := rule.perms.Union(oldPerms) + e.rules[rule.meta] = newPerms + return nil +} + +func (e *Emulator) rmRule(rule deviceRule) error { + // Give an error if any of the permissions requested to be removed are + // present in a partially-matching wildcard rule, because such rules will + // be ignored by cgroupv1. + // + // This is a diversion from cgroupv1, but is necessary to avoid leading + // users into a false sense of security. cgroupv1 will silently(!) ignore + // requests to remove partial exceptions, but we really shouldn't do that. + // + // It may seem like we could just "split" wildcard rules which hit this + // issue, but unfortunately there are 2^32 possible major and minor + // numbers, which would exhaust kernel memory quickly if we did this. Not + // to mention it'd be really slow (the kernel side is implemented as a + // linked-list of exceptions). + for _, partialMeta := range []deviceMeta{ + {node: rule.meta.node, major: configs.Wildcard, minor: rule.meta.minor}, + {node: rule.meta.node, major: rule.meta.major, minor: configs.Wildcard}, + {node: rule.meta.node, major: configs.Wildcard, minor: configs.Wildcard}, + } { + // This wildcard rule is equivalent to the requested rule, so skip it. + if rule.meta == partialMeta { + continue + } + // Only give an error if the set of permissions overlap. + partialPerms := e.rules[partialMeta] + if !partialPerms.Intersection(rule.perms).IsEmpty() { + return errors.Errorf("requested rule [%v %v] not supported by devices cgroupv1 (cannot punch hole in existing wildcard rule [%v %v])", rule.meta, rule.perms, partialMeta, partialPerms) + } + } + + // Subtract all of the permissions listed from the full match rule. If the + // rule didn't exist, all of this is a no-op. + newPerms := e.rules[rule.meta].Difference(rule.perms) + if newPerms.IsEmpty() { + delete(e.rules, rule.meta) + } else { + e.rules[rule.meta] = newPerms + } + // TODO: The actual cgroup code doesn't care if an exception didn't exist + // during removal, so not erroring out here is /accurate/ but quite + // worrying. Maybe we should do additional validation, but again we + // have to worry about backwards-compatibility. + return nil +} + +func (e *Emulator) allow(rule *deviceRule) error { + // This cgroup is configured as a black-list. Reset the entire emulator, + // and put is into black-list mode. + if rule == nil || rule.meta.node == configs.WildcardDevice { + *e = Emulator{ + defaultAllow: true, + rules: nil, + } + return nil + } + + var err error + if e.defaultAllow { + err = errors.Wrap(e.rmRule(*rule), "remove 'deny' exception") + } else { + err = errors.Wrap(e.addRule(*rule), "add 'allow' exception") + } + return err +} + +func (e *Emulator) deny(rule *deviceRule) error { + // This cgroup is configured as a white-list. Reset the entire emulator, + // and put is into white-list mode. + if rule == nil || rule.meta.node == configs.WildcardDevice { + *e = Emulator{ + defaultAllow: false, + rules: nil, + } + return nil + } + + var err error + if e.defaultAllow { + err = errors.Wrap(e.addRule(*rule), "add 'deny' exception") + } else { + err = errors.Wrap(e.rmRule(*rule), "remove 'allow' exception") + } + return err +} + +func (e *Emulator) Apply(rule configs.DeviceRule) error { + if !rule.Type.CanCgroup() { + return errors.Errorf("cannot add rule [%#v] with non-cgroup type %q", rule, rule.Type) + } + + innerRule := &deviceRule{ + meta: deviceMeta{ + node: rule.Type, + major: rule.Major, + minor: rule.Minor, + }, + perms: rule.Permissions, + } + if innerRule.meta.node == configs.WildcardDevice { + innerRule = nil + } + + if rule.Allow { + return e.allow(innerRule) + } else { + return e.deny(innerRule) + } +} + +// EmulatorFromList takes a reader to a "devices.list"-like source, and returns +// a new Emulator that represents the state of the devices cgroup. Note that +// black-list devices cgroups cannot be fully reconstructed, due to limitations +// in the devices cgroup API. Instead, such cgroups are always treated as +// "allow all" cgroups. +func EmulatorFromList(list io.Reader) (*Emulator, error) { + // Normally cgroups are in black-list mode by default, but the way we + // figure out the current mode is whether or not devices.list has an + // allow-all rule. So we default to a white-list, and the existence of an + // "a *:* rwm" entry will tell us otherwise. + e := &Emulator{ + defaultAllow: false, + } + + // Parse the "devices.list". + s := bufio.NewScanner(list) + for s.Scan() { + line := s.Text() + deviceRule, err := parseLine(line) + if err != nil { + return nil, errors.Wrapf(err, "parsing line %q", line) + } + // "devices.list" is an allow list. Note that this means that in + // black-list mode, we have no idea what rules are in play. As a + // result, we need to be very careful in Transition(). + if err := e.allow(deviceRule); err != nil { + return nil, errors.Wrapf(err, "adding devices.list rule") + } + } + if err := s.Err(); err != nil { + return nil, errors.Wrap(err, "reading devices.list lines") + } + return e, nil +} + +// Transition calculates what is the minimally-disruptive set of rules need to +// be applied to a devices cgroup in order to transition to the given target. +// This means that any already-existing rules will not be applied, and +// disruptive rules (like denying all device access) will only be applied if +// necessary. +// +// This function is the sole reason for all of Emulator -- to allow us +// to figure out how to update a containers' cgroups without causing spurrious +// device errors (if possible). +func (source *Emulator) Transition(target *Emulator) ([]*configs.DeviceRule, error) { + var transitionRules []*configs.DeviceRule + oldRules := source.rules + + // If the default policy doesn't match, we need to include a "disruptive" + // rule (either allow-all or deny-all) in order to switch the cgroup to the + // correct default policy. + // + // However, due to a limitation in "devices.list" we cannot be sure what + // deny rules are in place in a black-list cgroup. Thus if the source is a + // black-list we also have to include a disruptive rule. + if source.IsBlacklist() || source.defaultAllow != target.defaultAllow { + transitionRules = append(transitionRules, &configs.DeviceRule{ + Type: 'a', + Major: -1, + Minor: -1, + Permissions: configs.DevicePermissions("rwm"), + Allow: target.defaultAllow, + }) + // The old rules are only relevant if we aren't starting out with a + // disruptive rule. + oldRules = nil + } + + // NOTE: We traverse through the rules in a sorted order so we always write + // the same set of rules (this is to aid testing). + + // First, we create inverse rules for any old rules not in the new set. + // This includes partial-inverse rules for specific permissions. This is a + // no-op if we added a disruptive rule, since oldRules will be empty. + for _, rule := range oldRules.orderedEntries() { + meta, oldPerms := rule.meta, rule.perms + newPerms := target.rules[meta] + droppedPerms := oldPerms.Difference(newPerms) + if !droppedPerms.IsEmpty() { + transitionRules = append(transitionRules, &configs.DeviceRule{ + Type: meta.node, + Major: meta.major, + Minor: meta.minor, + Permissions: droppedPerms, + Allow: target.defaultAllow, + }) + } + } + + // Add any additional rules which weren't in the old set. We happen to + // filter out rules which are present in both sets, though this isn't + // strictly necessary. + for _, rule := range target.rules.orderedEntries() { + meta, newPerms := rule.meta, rule.perms + oldPerms := oldRules[meta] + gainedPerms := newPerms.Difference(oldPerms) + if !gainedPerms.IsEmpty() { + transitionRules = append(transitionRules, &configs.DeviceRule{ + Type: meta.node, + Major: meta.major, + Minor: meta.minor, + Permissions: gainedPerms, + Allow: !target.defaultAllow, + }) + } + } + return transitionRules, nil +} diff --git a/libcontainer/cgroups/devices/devices_emulator_test.go b/libcontainer/cgroups/devices/devices_emulator_test.go new file mode 100644 index 000000000000..f813accef239 --- /dev/null +++ b/libcontainer/cgroups/devices/devices_emulator_test.go @@ -0,0 +1,1096 @@ +package devices + +import ( + "bytes" + "reflect" + "testing" + + "github.com/opencontainers/runc/libcontainer/configs" +) + +func TestDeviceEmulatorLoad(t *testing.T) { + tests := []struct { + name, list string + expected *Emulator + }{ + { + name: "BlacklistMode", + list: `a *:* rwm`, + expected: &Emulator{ + defaultAllow: true, + }, + }, + { + name: "WhitelistBasic", + list: `c 4:2 rw`, + expected: &Emulator{ + defaultAllow: false, + rules: deviceRules{ + { + node: configs.CharDevice, + major: 4, + minor: 2, + }: configs.DevicePermissions("rw"), + }, + }, + }, + { + name: "WhitelistWildcard", + list: `b 0:* m`, + expected: &Emulator{ + defaultAllow: false, + rules: deviceRules{ + { + node: configs.BlockDevice, + major: 0, + minor: configs.Wildcard, + }: configs.DevicePermissions("m"), + }, + }, + }, + { + name: "WhitelistDuplicate", + list: `c *:* rwm +c 1:1 r`, + expected: &Emulator{ + defaultAllow: false, + rules: deviceRules{ + { + node: configs.CharDevice, + major: configs.Wildcard, + minor: configs.Wildcard, + }: configs.DevicePermissions("rwm"), + // To match the kernel, we allow redundant rules. + { + node: configs.CharDevice, + major: 1, + minor: 1, + }: configs.DevicePermissions("r"), + }, + }, + }, + { + name: "WhitelistComplicated", + list: `c *:* m +b *:* m +c 1:3 rwm +c 1:5 rwm +c 1:7 rwm +c 1:8 rwm +c 1:9 rwm +c 5:0 rwm +c 5:2 rwm +c 136:* rwm +c 10:200 rwm`, + expected: &Emulator{ + defaultAllow: false, + rules: deviceRules{ + { + node: configs.CharDevice, + major: configs.Wildcard, + minor: configs.Wildcard, + }: configs.DevicePermissions("m"), + { + node: configs.BlockDevice, + major: configs.Wildcard, + minor: configs.Wildcard, + }: configs.DevicePermissions("m"), + { + node: configs.CharDevice, + major: 1, + minor: 3, + }: configs.DevicePermissions("rwm"), + { + node: configs.CharDevice, + major: 1, + minor: 5, + }: configs.DevicePermissions("rwm"), + { + node: configs.CharDevice, + major: 1, + minor: 7, + }: configs.DevicePermissions("rwm"), + { + node: configs.CharDevice, + major: 1, + minor: 8, + }: configs.DevicePermissions("rwm"), + { + node: configs.CharDevice, + major: 1, + minor: 9, + }: configs.DevicePermissions("rwm"), + { + node: configs.CharDevice, + major: 5, + minor: 0, + }: configs.DevicePermissions("rwm"), + { + node: configs.CharDevice, + major: 5, + minor: 2, + }: configs.DevicePermissions("rwm"), + { + node: configs.CharDevice, + major: 136, + minor: configs.Wildcard, + }: configs.DevicePermissions("rwm"), + { + node: configs.CharDevice, + major: 10, + minor: 200, + }: configs.DevicePermissions("rwm"), + }, + }, + }, + // Some invalid lists. + { + name: "InvalidFieldNumber", + list: `b 1:0`, + expected: nil, + }, + { + name: "InvalidDeviceType", + list: `p *:* rwm`, + expected: nil, + }, + { + name: "InvalidMajorNumber1", + list: `p -1:3 rwm`, + expected: nil, + }, + { + name: "InvalidMajorNumber2", + list: `c foo:27 rwm`, + expected: nil, + }, + { + name: "InvalidMinorNumber1", + list: `b 1:-4 rwm`, + expected: nil, + }, + { + name: "InvalidMinorNumber2", + list: `b 1:foo rwm`, + expected: nil, + }, + { + name: "InvalidPermissions", + list: `b 1:7 rwk`, + expected: nil, + }, + } + + for _, test := range tests { + test := test // capture range variable + t.Run(test.name, func(t *testing.T) { + list := bytes.NewBufferString(test.list) + emu, err := EmulatorFromList(list) + if err != nil && test.expected != nil { + t.Fatalf("unexpected failure when creating emulator: %v", err) + } else if err == nil && test.expected == nil { + t.Fatalf("unexpected success when creating emulator: %#v", emu) + } + + if !reflect.DeepEqual(emu, test.expected) { + t.Errorf("final emulator state mismatch: %#v != %#v", emu, test.expected) + } + }) + } +} + +func testDeviceEmulatorApply(t *testing.T, baseDefaultAllow bool) { + tests := []struct { + name string + rule configs.DeviceRule + base, expected *Emulator + }{ + // Switch between default modes. + { + name: "SwitchToOtherMode", + rule: configs.DeviceRule{ + Type: configs.WildcardDevice, + Major: configs.Wildcard, + Minor: configs.Wildcard, + Permissions: configs.DevicePermissions("rwm"), + Allow: !baseDefaultAllow, + }, + base: &Emulator{ + defaultAllow: baseDefaultAllow, + rules: deviceRules{ + { + node: configs.CharDevice, + major: configs.Wildcard, + minor: configs.Wildcard, + }: configs.DevicePermissions("rwm"), + { + node: configs.CharDevice, + major: 1, + minor: 1, + }: configs.DevicePermissions("r"), + }, + }, + expected: &Emulator{ + defaultAllow: !baseDefaultAllow, + rules: nil, + }, + }, + { + name: "SwitchToSameModeNoop", + rule: configs.DeviceRule{ + Type: configs.WildcardDevice, + Major: configs.Wildcard, + Minor: configs.Wildcard, + Permissions: configs.DevicePermissions("rwm"), + Allow: baseDefaultAllow, + }, + base: &Emulator{ + defaultAllow: baseDefaultAllow, + rules: nil, + }, + expected: &Emulator{ + defaultAllow: baseDefaultAllow, + rules: nil, + }, + }, + { + name: "SwitchToSameMode", + rule: configs.DeviceRule{ + Type: configs.WildcardDevice, + Major: configs.Wildcard, + Minor: configs.Wildcard, + Permissions: configs.DevicePermissions("rwm"), + Allow: baseDefaultAllow, + }, + base: &Emulator{ + defaultAllow: baseDefaultAllow, + rules: deviceRules{ + { + node: configs.CharDevice, + major: configs.Wildcard, + minor: configs.Wildcard, + }: configs.DevicePermissions("rwm"), + { + node: configs.CharDevice, + major: 1, + minor: 1, + }: configs.DevicePermissions("r"), + }, + }, + expected: &Emulator{ + defaultAllow: baseDefaultAllow, + rules: nil, + }, + }, + // Rule addition logic. + { + name: "RuleAdditionBasic", + rule: configs.DeviceRule{ + Type: configs.CharDevice, + Major: 42, + Minor: 1337, + Permissions: configs.DevicePermissions("rm"), + Allow: !baseDefaultAllow, + }, + base: &Emulator{ + defaultAllow: baseDefaultAllow, + rules: deviceRules{ + { + node: configs.CharDevice, + major: 2, + minor: 1, + }: configs.DevicePermissions("rwm"), + { + node: configs.BlockDevice, + major: 1, + minor: 5, + }: configs.DevicePermissions("r"), + }, + }, + expected: &Emulator{ + defaultAllow: baseDefaultAllow, + rules: deviceRules{ + { + node: configs.CharDevice, + major: 2, + minor: 1, + }: configs.DevicePermissions("rwm"), + { + node: configs.BlockDevice, + major: 1, + minor: 5, + }: configs.DevicePermissions("r"), + { + node: configs.CharDevice, + major: 42, + minor: 1337, + }: configs.DevicePermissions("rm"), + }, + }, + }, + { + name: "RuleAdditionBasicDuplicate", + rule: configs.DeviceRule{ + Type: configs.CharDevice, + Major: 42, + Minor: 1337, + Permissions: configs.DevicePermissions("rm"), + Allow: !baseDefaultAllow, + }, + base: &Emulator{ + defaultAllow: baseDefaultAllow, + rules: deviceRules{ + { + node: configs.CharDevice, + major: 42, + minor: configs.Wildcard, + }: configs.DevicePermissions("rwm"), + }, + }, + expected: &Emulator{ + defaultAllow: baseDefaultAllow, + rules: deviceRules{ + { + node: configs.CharDevice, + major: 42, + minor: configs.Wildcard, + }: configs.DevicePermissions("rwm"), + // To match the kernel, we allow redundant rules. + { + node: configs.CharDevice, + major: 42, + minor: 1337, + }: configs.DevicePermissions("rm"), + }, + }, + }, + { + name: "RuleAdditionBasicDuplicateNoop", + rule: configs.DeviceRule{ + Type: configs.CharDevice, + Major: 42, + Minor: 1337, + Permissions: configs.DevicePermissions("rm"), + Allow: !baseDefaultAllow, + }, + base: &Emulator{ + defaultAllow: baseDefaultAllow, + rules: deviceRules{ + { + node: configs.CharDevice, + major: 42, + minor: 1337, + }: configs.DevicePermissions("rm"), + }, + }, + expected: &Emulator{ + defaultAllow: baseDefaultAllow, + rules: deviceRules{ + { + node: configs.CharDevice, + major: 42, + minor: 1337, + }: configs.DevicePermissions("rm"), + }, + }, + }, + { + name: "RuleAdditionMerge", + rule: configs.DeviceRule{ + Type: configs.BlockDevice, + Major: 5, + Minor: 12, + Permissions: configs.DevicePermissions("rm"), + Allow: !baseDefaultAllow, + }, + base: &Emulator{ + defaultAllow: baseDefaultAllow, + rules: deviceRules{ + { + node: configs.CharDevice, + major: 2, + minor: 1, + }: configs.DevicePermissions("rwm"), + { + node: configs.BlockDevice, + major: 5, + minor: 12, + }: configs.DevicePermissions("rw"), + }, + }, + expected: &Emulator{ + defaultAllow: baseDefaultAllow, + rules: deviceRules{ + { + node: configs.CharDevice, + major: 2, + minor: 1, + }: configs.DevicePermissions("rwm"), + { + node: configs.BlockDevice, + major: 5, + minor: 12, + }: configs.DevicePermissions("rwm"), + }, + }, + }, + { + name: "RuleAdditionMergeWildcard", + rule: configs.DeviceRule{ + Type: configs.BlockDevice, + Major: 5, + Minor: configs.Wildcard, + Permissions: configs.DevicePermissions("rm"), + Allow: !baseDefaultAllow, + }, + base: &Emulator{ + defaultAllow: baseDefaultAllow, + rules: deviceRules{ + { + node: configs.CharDevice, + major: 2, + minor: 1, + }: configs.DevicePermissions("rwm"), + { + node: configs.BlockDevice, + major: 5, + minor: configs.Wildcard, + }: configs.DevicePermissions("rw"), + }, + }, + expected: &Emulator{ + defaultAllow: baseDefaultAllow, + rules: deviceRules{ + { + node: configs.CharDevice, + major: 2, + minor: 1, + }: configs.DevicePermissions("rwm"), + { + node: configs.BlockDevice, + major: 5, + minor: configs.Wildcard, + }: configs.DevicePermissions("rwm"), + }, + }, + }, + { + name: "RuleAdditionMergeNoop", + rule: configs.DeviceRule{ + Type: configs.BlockDevice, + Major: 5, + Minor: 12, + Permissions: configs.DevicePermissions("r"), + Allow: !baseDefaultAllow, + }, + base: &Emulator{ + defaultAllow: baseDefaultAllow, + rules: deviceRules{ + { + node: configs.CharDevice, + major: 2, + minor: 1, + }: configs.DevicePermissions("rwm"), + { + node: configs.BlockDevice, + major: 5, + minor: 12, + }: configs.DevicePermissions("rw"), + }, + }, + expected: &Emulator{ + defaultAllow: baseDefaultAllow, + rules: deviceRules{ + { + node: configs.CharDevice, + major: 2, + minor: 1, + }: configs.DevicePermissions("rwm"), + { + node: configs.BlockDevice, + major: 5, + minor: 12, + }: configs.DevicePermissions("rw"), + }, + }, + }, + // Rule removal logic. + { + name: "RuleRemovalBasic", + rule: configs.DeviceRule{ + Type: configs.CharDevice, + Major: 42, + Minor: 1337, + Permissions: configs.DevicePermissions("rm"), + Allow: baseDefaultAllow, + }, + base: &Emulator{ + defaultAllow: baseDefaultAllow, + rules: deviceRules{ + { + node: configs.CharDevice, + major: 42, + minor: 1337, + }: configs.DevicePermissions("rm"), + { + node: configs.BlockDevice, + major: 1, + minor: 5, + }: configs.DevicePermissions("r"), + }, + }, + expected: &Emulator{ + defaultAllow: baseDefaultAllow, + rules: deviceRules{ + { + node: configs.BlockDevice, + major: 1, + minor: 5, + }: configs.DevicePermissions("r"), + }, + }, + }, + { + name: "RuleRemovalNonexistent", + rule: configs.DeviceRule{ + Type: configs.CharDevice, + Major: 4, + Minor: 1, + Permissions: configs.DevicePermissions("rw"), + Allow: baseDefaultAllow, + }, + base: &Emulator{ + defaultAllow: baseDefaultAllow, + rules: deviceRules{ + { + node: configs.BlockDevice, + major: 1, + minor: 5, + }: configs.DevicePermissions("r"), + }, + }, + expected: &Emulator{ + defaultAllow: baseDefaultAllow, + rules: deviceRules{ + { + node: configs.BlockDevice, + major: 1, + minor: 5, + }: configs.DevicePermissions("r"), + }, + }, + }, + { + name: "RuleRemovalFull", + rule: configs.DeviceRule{ + Type: configs.CharDevice, + Major: 42, + Minor: 1337, + Permissions: configs.DevicePermissions("rw"), + Allow: baseDefaultAllow, + }, + base: &Emulator{ + defaultAllow: baseDefaultAllow, + rules: deviceRules{ + { + node: configs.CharDevice, + major: 42, + minor: 1337, + }: configs.DevicePermissions("w"), + { + node: configs.BlockDevice, + major: 1, + minor: 5, + }: configs.DevicePermissions("r"), + }, + }, + expected: &Emulator{ + defaultAllow: baseDefaultAllow, + rules: deviceRules{ + { + node: configs.BlockDevice, + major: 1, + minor: 5, + }: configs.DevicePermissions("r"), + }, + }, + }, + { + name: "RuleRemovalPartial", + rule: configs.DeviceRule{ + Type: configs.CharDevice, + Major: 42, + Minor: 1337, + Permissions: configs.DevicePermissions("r"), + Allow: baseDefaultAllow, + }, + base: &Emulator{ + defaultAllow: baseDefaultAllow, + rules: deviceRules{ + { + node: configs.CharDevice, + major: 42, + minor: 1337, + }: configs.DevicePermissions("rm"), + { + node: configs.BlockDevice, + major: 1, + minor: 5, + }: configs.DevicePermissions("r"), + }, + }, + expected: &Emulator{ + defaultAllow: baseDefaultAllow, + rules: deviceRules{ + { + node: configs.CharDevice, + major: 42, + minor: 1337, + }: configs.DevicePermissions("m"), + { + node: configs.BlockDevice, + major: 1, + minor: 5, + }: configs.DevicePermissions("r"), + }, + }, + }, + // Check our non-canonical behaviour when it comes to try to "punch + // out" holes in a wildcard rule. + { + name: "RuleRemovalWildcardPunchoutImpossible", + rule: configs.DeviceRule{ + Type: configs.CharDevice, + Major: 42, + Minor: 1337, + Permissions: configs.DevicePermissions("r"), + Allow: baseDefaultAllow, + }, + base: &Emulator{ + defaultAllow: baseDefaultAllow, + rules: deviceRules{ + { + node: configs.CharDevice, + major: 42, + minor: configs.Wildcard, + }: configs.DevicePermissions("rm"), + { + node: configs.CharDevice, + major: 42, + minor: 1337, + }: configs.DevicePermissions("r"), + }, + }, + expected: nil, + }, + { + name: "RuleRemovalWildcardPunchoutPossible", + rule: configs.DeviceRule{ + Type: configs.CharDevice, + Major: 42, + Minor: 1337, + Permissions: configs.DevicePermissions("r"), + Allow: baseDefaultAllow, + }, + base: &Emulator{ + defaultAllow: baseDefaultAllow, + rules: deviceRules{ + { + node: configs.CharDevice, + major: 42, + minor: configs.Wildcard, + }: configs.DevicePermissions("wm"), + { + node: configs.CharDevice, + major: 42, + minor: 1337, + }: configs.DevicePermissions("r"), + }, + }, + expected: &Emulator{ + defaultAllow: baseDefaultAllow, + rules: deviceRules{ + { + node: configs.CharDevice, + major: 42, + minor: configs.Wildcard, + }: configs.DevicePermissions("wm"), + }, + }, + }, + } + + for _, test := range tests { + test := test + t.Run(test.name, func(t *testing.T) { + err := test.base.Apply(test.rule) + if err != nil && test.expected != nil { + t.Fatalf("unexpected failure when applying apply rule: %v", err) + } else if err == nil && test.expected == nil { + t.Fatalf("unexpected success when applying apply rule: %#v", test.base) + } + + if test.expected != nil && !reflect.DeepEqual(test.base, test.expected) { + t.Errorf("final emulator state mismatch: %#v != %#v", test.base, test.expected) + } + }) + } +} + +func TestDeviceEmulatorWhitelistApply(t *testing.T) { + testDeviceEmulatorApply(t, false) +} + +func TestDeviceEmulatorBlacklistApply(t *testing.T) { + testDeviceEmulatorApply(t, true) +} + +func testDeviceEmulatorTransition(t *testing.T, sourceDefaultAllow bool) { + tests := []struct { + name string + source, target *Emulator + expected []*configs.DeviceRule + }{ + // No-op changes. + { + name: "Noop", + source: &Emulator{ + defaultAllow: sourceDefaultAllow, + rules: deviceRules{ + { + node: configs.CharDevice, + major: 42, + minor: configs.Wildcard, + }: configs.DevicePermissions("wm"), + }, + }, + target: &Emulator{ + defaultAllow: sourceDefaultAllow, + rules: deviceRules{ + { + node: configs.CharDevice, + major: 42, + minor: configs.Wildcard, + }: configs.DevicePermissions("wm"), + }, + }, + // Identical white-lists produce no extra rules. + expected: nil, + }, + // Switching modes. + { + name: "SwitchToOtherMode", + source: &Emulator{ + defaultAllow: sourceDefaultAllow, + rules: deviceRules{ + { + node: configs.CharDevice, + major: 1, + minor: 2, + }: configs.DevicePermissions("rwm"), + }, + }, + target: &Emulator{ + defaultAllow: !sourceDefaultAllow, + rules: deviceRules{ + { + node: configs.BlockDevice, + major: 42, + minor: configs.Wildcard, + }: configs.DevicePermissions("wm"), + }, + }, + expected: []*configs.DeviceRule{ + // Clear-all rule. + { + Type: configs.WildcardDevice, + Major: configs.Wildcard, + Minor: configs.Wildcard, + Permissions: configs.DevicePermissions("rwm"), + Allow: !sourceDefaultAllow, + }, + // The actual rule-set. + { + Type: configs.BlockDevice, + Major: 42, + Minor: configs.Wildcard, + Permissions: configs.DevicePermissions("wm"), + Allow: sourceDefaultAllow, + }, + }, + }, + // Rule changes. + { + name: "RuleAddition", + source: &Emulator{ + defaultAllow: sourceDefaultAllow, + rules: deviceRules{ + { + node: configs.CharDevice, + major: 1, + minor: 2, + }: configs.DevicePermissions("rwm"), + }, + }, + target: &Emulator{ + defaultAllow: sourceDefaultAllow, + rules: deviceRules{ + { + node: configs.CharDevice, + major: 1, + minor: 2, + }: configs.DevicePermissions("rwm"), + { + node: configs.BlockDevice, + major: 42, + minor: 1337, + }: configs.DevicePermissions("rwm"), + }, + }, + expected: []*configs.DeviceRule{ + { + Type: configs.BlockDevice, + Major: 42, + Minor: 1337, + Permissions: configs.DevicePermissions("rwm"), + Allow: !sourceDefaultAllow, + }, + }, + }, + { + name: "RuleRemoval", + source: &Emulator{ + defaultAllow: sourceDefaultAllow, + rules: deviceRules{ + { + node: configs.CharDevice, + major: 1, + minor: 2, + }: configs.DevicePermissions("rwm"), + { + node: configs.BlockDevice, + major: 42, + minor: 1337, + }: configs.DevicePermissions("rwm"), + }, + }, + target: &Emulator{ + defaultAllow: sourceDefaultAllow, + rules: deviceRules{ + { + node: configs.CharDevice, + major: 1, + minor: 2, + }: configs.DevicePermissions("rwm"), + }, + }, + expected: []*configs.DeviceRule{ + { + Type: configs.BlockDevice, + Major: 42, + Minor: 1337, + Permissions: configs.DevicePermissions("rwm"), + Allow: sourceDefaultAllow, + }, + }, + }, + { + name: "RuleMultipleAdditionRemoval", + source: &Emulator{ + defaultAllow: sourceDefaultAllow, + rules: deviceRules{ + { + node: configs.CharDevice, + major: 1, + minor: 2, + }: configs.DevicePermissions("rwm"), + { + node: configs.BlockDevice, + major: 3, + minor: 9, + }: configs.DevicePermissions("rw"), + }, + }, + target: &Emulator{ + defaultAllow: sourceDefaultAllow, + rules: deviceRules{ + { + node: configs.CharDevice, + major: 1, + minor: 2, + }: configs.DevicePermissions("rwm"), + }, + }, + expected: []*configs.DeviceRule{ + { + Type: configs.BlockDevice, + Major: 3, + Minor: 9, + Permissions: configs.DevicePermissions("rw"), + Allow: sourceDefaultAllow, + }, + }, + }, + // Modifying the access permissions. + { + name: "RulePartialAddition", + source: &Emulator{ + defaultAllow: sourceDefaultAllow, + rules: deviceRules{ + { + node: configs.CharDevice, + major: 1, + minor: 2, + }: configs.DevicePermissions("r"), + }, + }, + target: &Emulator{ + defaultAllow: sourceDefaultAllow, + rules: deviceRules{ + { + node: configs.CharDevice, + major: 1, + minor: 2, + }: configs.DevicePermissions("rwm"), + }, + }, + expected: []*configs.DeviceRule{ + { + Type: configs.CharDevice, + Major: 1, + Minor: 2, + Permissions: configs.DevicePermissions("wm"), + Allow: !sourceDefaultAllow, + }, + }, + }, + { + name: "RulePartialRemoval", + source: &Emulator{ + defaultAllow: sourceDefaultAllow, + rules: deviceRules{ + { + node: configs.CharDevice, + major: 1, + minor: 2, + }: configs.DevicePermissions("rw"), + }, + }, + target: &Emulator{ + defaultAllow: sourceDefaultAllow, + rules: deviceRules{ + { + node: configs.CharDevice, + major: 1, + minor: 2, + }: configs.DevicePermissions("w"), + }, + }, + expected: []*configs.DeviceRule{ + { + Type: configs.CharDevice, + Major: 1, + Minor: 2, + Permissions: configs.DevicePermissions("r"), + Allow: sourceDefaultAllow, + }, + }, + }, + { + name: "RulePartialBoth", + source: &Emulator{ + defaultAllow: sourceDefaultAllow, + rules: deviceRules{ + { + node: configs.CharDevice, + major: 1, + minor: 2, + }: configs.DevicePermissions("rw"), + }, + }, + target: &Emulator{ + defaultAllow: sourceDefaultAllow, + rules: deviceRules{ + { + node: configs.CharDevice, + major: 1, + minor: 2, + }: configs.DevicePermissions("rm"), + }, + }, + expected: []*configs.DeviceRule{ + { + Type: configs.CharDevice, + Major: 1, + Minor: 2, + Permissions: configs.DevicePermissions("w"), + Allow: sourceDefaultAllow, + }, + { + Type: configs.CharDevice, + Major: 1, + Minor: 2, + Permissions: configs.DevicePermissions("m"), + Allow: !sourceDefaultAllow, + }, + }, + }, + } + + for _, test := range tests { + test := test + t.Run(test.name, func(t *testing.T) { + // If we are in black-list mode, we need to prepend the relevant + // clear-all rule (the expected rule lists are written with + // white-list mode in mind), and then make a full copy of the + // target rules. + if sourceDefaultAllow && test.source.defaultAllow == test.target.defaultAllow { + test.expected = []*configs.DeviceRule{{ + Type: configs.WildcardDevice, + Major: configs.Wildcard, + Minor: configs.Wildcard, + Permissions: configs.DevicePermissions("rwm"), + Allow: test.target.defaultAllow, + }} + for _, rule := range test.target.rules.orderedEntries() { + test.expected = append(test.expected, &configs.DeviceRule{ + Type: rule.meta.node, + Major: rule.meta.major, + Minor: rule.meta.minor, + Permissions: rule.perms, + Allow: !test.target.defaultAllow, + }) + } + } + + rules, err := test.source.Transition(test.target) + if err != nil { + t.Fatalf("unexpected error while calculating transition rules: %#v", err) + } + + if !reflect.DeepEqual(rules, test.expected) { + t.Errorf("rules don't match expected set: %#v != %#v", rules, test.expected) + } + + // Apply the rules to the source to see if it actually transitions + // correctly. This is all emulated but it's a good thing to + // double-check. + for _, rule := range rules { + if err := test.source.Apply(*rule); err != nil { + t.Fatalf("error while applying transition rule [%#v]: %v", rule, err) + } + } + if !reflect.DeepEqual(test.source, test.target) { + t.Errorf("transition incomplete after applying all rules: %#v != %#v", test.source, test.target) + } + }) + } +} + +func TestDeviceEmulatorTransitionFromBlacklist(t *testing.T) { + testDeviceEmulatorTransition(t, true) +} + +func TestDeviceEmulatorTransitionFromWhitelist(t *testing.T) { + testDeviceEmulatorTransition(t, false) +} diff --git a/libcontainer/cgroups/ebpf/devicefilter/devicefilter.go b/libcontainer/cgroups/ebpf/devicefilter/devicefilter.go index 847ce8ef1a0d..af9275451cd8 100644 --- a/libcontainer/cgroups/ebpf/devicefilter/devicefilter.go +++ b/libcontainer/cgroups/ebpf/devicefilter/devicefilter.go @@ -22,7 +22,7 @@ const ( ) // DeviceFilter returns eBPF device filter program and its license string -func DeviceFilter(devices []*configs.Device) (asm.Instructions, string, error) { +func DeviceFilter(devices []*configs.DeviceRule) (asm.Instructions, string, error) { p := &program{} p.init() for i := len(devices) - 1; i >= 0; i-- { @@ -67,7 +67,7 @@ func (p *program) init() { } // appendDevice needs to be called from the last element of OCI linux.resources.devices to the head element. -func (p *program) appendDevice(dev *configs.Device) error { +func (p *program) appendDevice(dev *configs.DeviceRule) error { if p.blockID < 0 { return errors.New("the program is finalized") } diff --git a/libcontainer/cgroups/ebpf/devicefilter/devicefilter_test.go b/libcontainer/cgroups/ebpf/devicefilter/devicefilter_test.go index 59ff4b49bd35..885bf4db4ef5 100644 --- a/libcontainer/cgroups/ebpf/devicefilter/devicefilter_test.go +++ b/libcontainer/cgroups/ebpf/devicefilter/devicefilter_test.go @@ -20,7 +20,7 @@ func hash(s, comm string) string { return strings.Join(res, "\n") } -func testDeviceFilter(t testing.TB, devices []*configs.Device, expectedStr string) { +func testDeviceFilter(t testing.TB, devices []*configs.DeviceRule, expectedStr string) { insts, _, err := DeviceFilter(devices) if err != nil { t.Fatalf("%s: %v (devices: %+v)", t.Name(), err, devices) @@ -81,71 +81,69 @@ block-2: 18: Exit block-3: 19: JNEImm dst: r2 off: -1 imm: 2 <block-4> - 20: JNEImm dst: r4 off: -1 imm: 5 <block-4> - 21: JNEImm dst: r5 off: -1 imm: 1 <block-4> + 20: JNEImm dst: r4 off: -1 imm: 1 <block-4> + 21: JNEImm dst: r5 off: -1 imm: 9 <block-4> 22: Mov32Imm dst: r0 imm: 1 23: Exit block-4: 24: JNEImm dst: r2 off: -1 imm: 2 <block-5> 25: JNEImm dst: r4 off: -1 imm: 1 <block-5> - 26: JNEImm dst: r5 off: -1 imm: 9 <block-5> + 26: JNEImm dst: r5 off: -1 imm: 5 <block-5> 27: Mov32Imm dst: r0 imm: 1 28: Exit block-5: 29: JNEImm dst: r2 off: -1 imm: 2 <block-6> - 30: JNEImm dst: r4 off: -1 imm: 1 <block-6> - 31: JNEImm dst: r5 off: -1 imm: 5 <block-6> + 30: JNEImm dst: r4 off: -1 imm: 5 <block-6> + 31: JNEImm dst: r5 off: -1 imm: 0 <block-6> 32: Mov32Imm dst: r0 imm: 1 33: Exit block-6: 34: JNEImm dst: r2 off: -1 imm: 2 <block-7> - 35: JNEImm dst: r4 off: -1 imm: 5 <block-7> - 36: JNEImm dst: r5 off: -1 imm: 0 <block-7> + 35: JNEImm dst: r4 off: -1 imm: 1 <block-7> + 36: JNEImm dst: r5 off: -1 imm: 7 <block-7> 37: Mov32Imm dst: r0 imm: 1 38: Exit block-7: 39: JNEImm dst: r2 off: -1 imm: 2 <block-8> 40: JNEImm dst: r4 off: -1 imm: 1 <block-8> - 41: JNEImm dst: r5 off: -1 imm: 7 <block-8> + 41: JNEImm dst: r5 off: -1 imm: 8 <block-8> 42: Mov32Imm dst: r0 imm: 1 43: Exit block-8: 44: JNEImm dst: r2 off: -1 imm: 2 <block-9> 45: JNEImm dst: r4 off: -1 imm: 1 <block-9> - 46: JNEImm dst: r5 off: -1 imm: 8 <block-9> + 46: JNEImm dst: r5 off: -1 imm: 3 <block-9> 47: Mov32Imm dst: r0 imm: 1 48: Exit block-9: - 49: JNEImm dst: r2 off: -1 imm: 2 <block-10> - 50: JNEImm dst: r4 off: -1 imm: 1 <block-10> - 51: JNEImm dst: r5 off: -1 imm: 3 <block-10> - 52: Mov32Imm dst: r0 imm: 1 - 53: Exit -block-10: // (b, wildcard, wildcard, m, true) - 54: JNEImm dst: r2 off: -1 imm: 1 <block-11> - 55: Mov32Reg dst: r1 src: r3 - 56: And32Imm dst: r1 imm: 1 - 57: JEqImm dst: r1 off: -1 imm: 0 <block-11> - 58: Mov32Imm dst: r0 imm: 1 - 59: Exit -block-11: + 49: JNEImm dst: r2 off: -1 imm: 1 <block-10> + 50: Mov32Reg dst: r1 src: r3 + 51: And32Imm dst: r1 imm: 1 + 52: JEqImm dst: r1 off: -1 imm: 0 <block-10> + 53: Mov32Imm dst: r0 imm: 1 + 54: Exit +block-10: // (c, wildcard, wildcard, m, true) - 60: JNEImm dst: r2 off: -1 imm: 2 <block-12> - 61: Mov32Reg dst: r1 src: r3 - 62: And32Imm dst: r1 imm: 1 - 63: JEqImm dst: r1 off: -1 imm: 0 <block-12> - 64: Mov32Imm dst: r0 imm: 1 - 65: Exit -block-12: - 66: Mov32Imm dst: r0 imm: 0 - 67: Exit + 55: JNEImm dst: r2 off: -1 imm: 2 <block-11> + 56: Mov32Reg dst: r1 src: r3 + 57: And32Imm dst: r1 imm: 1 + 58: JEqImm dst: r1 off: -1 imm: 0 <block-11> + 59: Mov32Imm dst: r0 imm: 1 + 60: Exit +block-11: + 61: Mov32Imm dst: r0 imm: 0 + 62: Exit ` - testDeviceFilter(t, specconv.AllowedDevices, expected) + var devices []*configs.DeviceRule + for _, device := range specconv.AllowedDevices { + devices = append(devices, &device.DeviceRule) + } + testDeviceFilter(t, devices, expected) } func TestDeviceFilter_Privileged(t *testing.T) { - devices := []*configs.Device{ + devices := []*configs.DeviceRule{ { Type: 'a', Major: -1, @@ -171,7 +169,7 @@ block-0: } func TestDeviceFilter_PrivilegedExceptSingleDevice(t *testing.T) { - devices := []*configs.Device{ + devices := []*configs.DeviceRule{ { Type: 'a', Major: -1, @@ -210,7 +208,7 @@ block-1: } func TestDeviceFilter_Weird(t *testing.T) { - devices := []*configs.Device{ + devices := []*configs.DeviceRule{ { Type: 'b', Major: 8, diff --git a/libcontainer/cgroups/fs/apply_raw.go b/libcontainer/cgroups/fs/apply_raw.go index ec148b489051..11a15660beb1 100644 --- a/libcontainer/cgroups/fs/apply_raw.go +++ b/libcontainer/cgroups/fs/apply_raw.go @@ -162,9 +162,6 @@ func (m *Manager) Apply(pid int) (err error) { } for _, sys := range m.getSubsystems() { - // TODO: Apply should, ideally, be reentrant or be broken up into a separate - // create and join phase so that the cgroup hierarchy for a container can be - // created then join consists of writing the process pids to cgroup.procs p, err := d.path(sys.Name()) if err != nil { // The non-presence of the devices subsystem is @@ -177,10 +174,10 @@ func (m *Manager) Apply(pid int) (err error) { m.Paths[sys.Name()] = p if err := sys.Apply(d); err != nil { - // In the case of rootless (including euid=0 in userns), where an explicit cgroup path hasn't - // been set, we don't bail on error in case of permission problems. - // Cases where limits have been set (and we couldn't create our own - // cgroup) are handled by Set. + // In the case of rootless (including euid=0 in userns), where an + // explicit cgroup path hasn't been set, we don't bail on error in + // case of permission problems. Cases where limits have been set + // (and we couldn't create our own cgroup) are handled by Set. if isIgnorableError(m.Rootless, err) && m.Cgroups.Path == "" { delete(m.Paths, sys.Name()) continue @@ -272,22 +269,26 @@ func (m *Manager) Set(container *configs.Config) error { // Freeze toggles the container's freezer cgroup depending on the state // provided -func (m *Manager) Freeze(state configs.FreezerState) error { - if m.Cgroups == nil { +func (m *Manager) Freeze(state configs.FreezerState) (Err error) { + path := m.GetPaths()["freezer"] + if m.Cgroups == nil || path == "" { return errors.New("cannot toggle freezer: cgroups not configured for container") } - paths := m.GetPaths() - dir := paths["freezer"] prevState := m.Cgroups.Resources.Freezer m.Cgroups.Resources.Freezer = state + defer func() { + if Err != nil { + m.Cgroups.Resources.Freezer = prevState + } + }() + freezer, err := m.getSubsystems().Get("freezer") if err != nil { return err } - err = freezer.Set(dir, m.Cgroups) + err = freezer.Set(path, m.Cgroups) if err != nil { - m.Cgroups.Resources.Freezer = prevState return err } return nil @@ -409,3 +410,15 @@ func CheckCpushares(path string, c uint64) error { func (m *Manager) GetCgroups() (*configs.Cgroup, error) { return m.Cgroups, nil } + +func (m *Manager) GetFreezerState() (configs.FreezerState, error) { + paths := m.GetPaths() + dir := paths["freezer"] + freezer, err := m.getSubsystems().Get("freezer") + + // If the container doesn't have the freezer cgroup, say it's undefined. + if err != nil || dir == "" { + return configs.Undefined, nil + } + return freezer.(*FreezerGroup).GetState(dir) +} diff --git a/libcontainer/cgroups/fs/devices.go b/libcontainer/cgroups/fs/devices.go index 036c8db773e5..4fc3951de913 100644 --- a/libcontainer/cgroups/fs/devices.go +++ b/libcontainer/cgroups/fs/devices.go @@ -3,13 +3,19 @@ package fs import ( + "bytes" + "fmt" + "reflect" + "github.com/opencontainers/runc/libcontainer/cgroups" + "github.com/opencontainers/runc/libcontainer/cgroups/devices" "github.com/opencontainers/runc/libcontainer/cgroups/fscommon" "github.com/opencontainers/runc/libcontainer/configs" "github.com/opencontainers/runc/libcontainer/system" ) type DevicesGroup struct { + testingSkipFinalCheck bool } func (s *DevicesGroup) Name() string { @@ -26,49 +32,74 @@ func (s *DevicesGroup) Apply(d *cgroupData) error { return nil } +func loadEmulator(path string) (*devices.Emulator, error) { + list, err := fscommon.ReadFile(path, "devices.list") + if err != nil { + return nil, err + } + return devices.EmulatorFromList(bytes.NewBufferString(list)) +} + +func buildEmulator(rules []*configs.DeviceRule) (*devices.Emulator, error) { + // This defaults to a white-list -- which is what we want! + emu := &devices.Emulator{} + for _, rule := range rules { + if err := emu.Apply(*rule); err != nil { + return nil, err + } + } + return emu, nil +} + func (s *DevicesGroup) Set(path string, cgroup *configs.Cgroup) error { if system.RunningInUserNS() { return nil } - devices := cgroup.Resources.Devices - if len(devices) > 0 { - for _, dev := range devices { - file := "devices.deny" - if dev.Allow { - file = "devices.allow" - } - if err := fscommon.WriteFile(path, file, dev.CgroupString()); err != nil { - return err - } - } - return nil + // Generate two emulators, one for the current state of the cgroup and one + // for the requested state by the user. + current, err := loadEmulator(path) + if err != nil { + return err + } + target, err := buildEmulator(cgroup.Resources.Devices) + if err != nil { + return err } - if cgroup.Resources.AllowAllDevices != nil { - if *cgroup.Resources.AllowAllDevices == false { - if err := fscommon.WriteFile(path, "devices.deny", "a"); err != nil { - return err - } - for _, dev := range cgroup.Resources.AllowedDevices { - if err := fscommon.WriteFile(path, "devices.allow", dev.CgroupString()); err != nil { - return err - } - } - return nil + // Compute the minimal set of transition rules needed to achieve the + // requested state. + transitionRules, err := current.Transition(target) + if err != nil { + return err + } + for _, rule := range transitionRules { + file := "devices.deny" + if rule.Allow { + file = "devices.allow" } - - if err := fscommon.WriteFile(path, "devices.allow", "a"); err != nil { + if err := fscommon.WriteFile(path, file, rule.CgroupString()); err != nil { return err } } - for _, dev := range cgroup.Resources.DeniedDevices { - if err := fscommon.WriteFile(path, "devices.deny", dev.CgroupString()); err != nil { + // Final safety check -- ensure that the resulting state is what was + // requested. This is only really correct for white-lists, but for + // black-lists we can at least check that the cgroup is in the right mode. + // + // This safety-check is skipped for the unit tests because we cannot + // currently mock devices.list correctly. + if !s.testingSkipFinalCheck { + currentAfter, err := loadEmulator(path) + if err != nil { return err } + if !target.IsBlacklist() && !reflect.DeepEqual(currentAfter, target) { + return fmt.Errorf("resulting devices cgroup doesn't precisely match target") + } else if target.IsBlacklist() != currentAfter.IsBlacklist() { + return fmt.Errorf("resulting devices cgroup doesn't match target mode") + } } - return nil } diff --git a/libcontainer/cgroups/fs/devices_test.go b/libcontainer/cgroups/fs/devices_test.go index 648f4a2ff47b..da7ccb01ecc1 100644 --- a/libcontainer/cgroups/fs/devices_test.go +++ b/libcontainer/cgroups/fs/devices_test.go @@ -9,91 +9,44 @@ import ( "github.com/opencontainers/runc/libcontainer/configs" ) -var ( - allowedDevices = []*configs.Device{ - { - Path: "/dev/zero", - Type: 'c', - Major: 1, - Minor: 5, - Permissions: "rwm", - FileMode: 0666, - }, - } - allowedList = "c 1:5 rwm" - deniedDevices = []*configs.Device{ - { - Path: "/dev/null", - Type: 'c', - Major: 1, - Minor: 3, - Permissions: "rwm", - FileMode: 0666, - }, - } - deniedList = "c 1:3 rwm" -) - func TestDevicesSetAllow(t *testing.T) { helper := NewCgroupTestUtil("devices", t) defer helper.cleanup() helper.writeFileContents(map[string]string{ - "devices.deny": "a", + "devices.allow": "", + "devices.deny": "", + "devices.list": "a *:* rwm", }) - allowAllDevices := false - helper.CgroupData.config.Resources.AllowAllDevices = &allowAllDevices - helper.CgroupData.config.Resources.AllowedDevices = allowedDevices - devices := &DevicesGroup{} - if err := devices.Set(helper.CgroupPath, helper.CgroupData.config); err != nil { - t.Fatal(err) - } - - value, err := fscommon.GetCgroupParamString(helper.CgroupPath, "devices.allow") - if err != nil { - t.Fatalf("Failed to parse devices.allow - %s", err) - } - - if value != allowedList { - t.Fatal("Got the wrong value, set devices.allow failed.") - } - // When AllowAllDevices is nil, devices.allow file should not be modified. - helper.CgroupData.config.Resources.AllowAllDevices = nil - if err := devices.Set(helper.CgroupPath, helper.CgroupData.config); err != nil { - t.Fatal(err) - } - value, err = fscommon.GetCgroupParamString(helper.CgroupPath, "devices.allow") - if err != nil { - t.Fatalf("Failed to parse devices.allow - %s", err) - } - if value != allowedList { - t.Fatal("devices policy shouldn't have changed on AllowedAllDevices=nil.") + helper.CgroupData.config.Resources.Devices = []*configs.DeviceRule{ + { + Type: configs.CharDevice, + Major: 1, + Minor: 5, + Permissions: configs.DevicePermissions("rwm"), + Allow: true, + }, } -} - -func TestDevicesSetDeny(t *testing.T) { - helper := NewCgroupTestUtil("devices", t) - defer helper.cleanup() - helper.writeFileContents(map[string]string{ - "devices.allow": "a", - }) - - allowAllDevices := true - helper.CgroupData.config.Resources.AllowAllDevices = &allowAllDevices - helper.CgroupData.config.Resources.DeniedDevices = deniedDevices - devices := &DevicesGroup{} + devices := &DevicesGroup{testingSkipFinalCheck: true} if err := devices.Set(helper.CgroupPath, helper.CgroupData.config); err != nil { t.Fatal(err) } + // The default deny rule must be written. value, err := fscommon.GetCgroupParamString(helper.CgroupPath, "devices.deny") if err != nil { - t.Fatalf("Failed to parse devices.deny - %s", err) + t.Fatalf("Failed to parse devices.deny: %s", err) + } + if value[0] != 'a' { + t.Errorf("Got the wrong value (%q), set devices.deny failed.", value) } - if value != deniedList { - t.Fatal("Got the wrong value, set devices.deny failed.") + // Permitted rule must be written. + if value, err := fscommon.GetCgroupParamString(helper.CgroupPath, "devices.allow"); err != nil { + t.Fatalf("Failed to parse devices.allow: %s", err) + } else if value != "c 1:5 rwm" { + t.Errorf("Got the wrong value (%q), set devices.allow failed.", value) } } diff --git a/libcontainer/cgroups/fs/freezer.go b/libcontainer/cgroups/fs/freezer.go index 9dc81bdb84c3..a6c622958c8c 100644 --- a/libcontainer/cgroups/fs/freezer.go +++ b/libcontainer/cgroups/fs/freezer.go @@ -4,12 +4,15 @@ package fs import ( "fmt" + "os" "strings" "time" "github.com/opencontainers/runc/libcontainer/cgroups" "github.com/opencontainers/runc/libcontainer/cgroups/fscommon" "github.com/opencontainers/runc/libcontainer/configs" + "github.com/pkg/errors" + "golang.org/x/sys/unix" ) type FreezerGroup struct { @@ -39,11 +42,11 @@ func (s *FreezerGroup) Set(path string, cgroup *configs.Cgroup) error { return err } - state, err := fscommon.ReadFile(path, "freezer.state") + state, err := s.GetState(path) if err != nil { return err } - if strings.TrimSpace(state) == string(cgroup.Resources.Freezer) { + if state == cgroup.Resources.Freezer { break } @@ -65,3 +68,30 @@ func (s *FreezerGroup) Remove(d *cgroupData) error { func (s *FreezerGroup) GetStats(path string, stats *cgroups.Stats) error { return nil } + +func (s *FreezerGroup) GetState(path string) (configs.FreezerState, error) { + for { + state, err := fscommon.ReadFile(path, "freezer.state") + if err != nil { + // If the kernel is too old, then we just treat the freezer as + // being in an "undefined" state. + if os.IsNotExist(err) || errors.Cause(err) == unix.ENODEV { + err = nil + } + return configs.Undefined, err + } + switch strings.TrimSpace(state) { + case "THAWED": + return configs.Thawed, nil + case "FROZEN": + return configs.Frozen, nil + case "FREEZING": + // Make sure we get a stable freezer state, so retry if the cgroup + // is still undergoing freezing. This should be a temporary delay. + time.Sleep(1 * time.Millisecond) + continue + default: + return configs.Undefined, fmt.Errorf("unknown freezer.state %q", state) + } + } +} diff --git a/libcontainer/cgroups/fs2/devices.go b/libcontainer/cgroups/fs2/devices.go index e0fd685402a2..ee180bdaf50b 100644 --- a/libcontainer/cgroups/fs2/devices.go +++ b/libcontainer/cgroups/fs2/devices.go @@ -10,12 +10,10 @@ import ( "golang.org/x/sys/unix" ) -func isRWM(cgroupPermissions string) bool { - r := false - w := false - m := false - for _, rn := range cgroupPermissions { - switch rn { +func isRWM(perms configs.DevicePermissions) bool { + var r, w, m bool + for _, perm := range perms { + switch perm { case 'r': r = true case 'w': @@ -39,22 +37,10 @@ func canSkipEBPFError(cgroup *configs.Cgroup) bool { } func setDevices(dirPath string, cgroup *configs.Cgroup) error { + // XXX: This is currently a white-list (but all callers pass a blacklist of + // devices). This is bad for a whole variety of reasons, but will need + // to be fixed with co-ordinated effort with downstreams. devices := cgroup.Devices - if allowAllDevices := cgroup.Resources.AllowAllDevices; allowAllDevices != nil { - // never set by OCI specconv, but *allowAllDevices=false is still used by the integration test - if *allowAllDevices == true { - return errors.New("libcontainer AllowAllDevices is not supported, use Devices") - } - for _, ad := range cgroup.Resources.AllowedDevices { - d := *ad - d.Allow = true - devices = append(devices, &d) - } - } - if len(cgroup.Resources.DeniedDevices) != 0 { - // never set by OCI specconv - return errors.New("libcontainer DeniedDevices is not supported, use Devices") - } insts, license, err := devicefilter.DeviceFilter(devices) if err != nil { return err @@ -64,6 +50,17 @@ func setDevices(dirPath string, cgroup *configs.Cgroup) error { return errors.Errorf("cannot get dir FD for %s", dirPath) } defer unix.Close(dirFD) + // XXX: This code is currently incorrect when it comes to updating an + // existing cgroup with new rules (new rulesets are just appended to + // the program list because this uses BPF_F_ALLOW_MULTI). If we didn't + // use BPF_F_ALLOW_MULTI we could actually atomically swap the + // programs. + // + // The real issue is that BPF_F_ALLOW_MULTI makes it hard to have a + // race-free blacklist because it acts as a whitelist by default, and + // having a deny-everything program cannot be overriden by other + // programs. You could temporarily insert a deny-everything program + // but that would result in spurrious failures during updates. if _, err := ebpf.LoadAttachCgroupDeviceFilter(insts, license, dirFD); err != nil { if !canSkipEBPFError(cgroup) { return err diff --git a/libcontainer/cgroups/fs2/freezer.go b/libcontainer/cgroups/fs2/freezer.go index 130c63f3e2bf..3dd4a527bf0f 100644 --- a/libcontainer/cgroups/fs2/freezer.go +++ b/libcontainer/cgroups/fs2/freezer.go @@ -3,32 +3,48 @@ package fs2 import ( - "strconv" + "os" "strings" "github.com/opencontainers/runc/libcontainer/cgroups/fscommon" "github.com/opencontainers/runc/libcontainer/configs" "github.com/pkg/errors" + "golang.org/x/sys/unix" ) func setFreezer(dirPath string, state configs.FreezerState) error { - var desired int + if err := supportsFreezer(dirPath); err != nil { + // We can ignore this request as long as the user didn't ask us to + // freeze the container (since without the freezer cgroup, that's a + // no-op). + if state == configs.Undefined || state == configs.Thawed { + err = nil + } + return errors.Wrap(err, "freezer not supported") + } + + var stateStr string switch state { case configs.Undefined: return nil case configs.Frozen: - desired = 1 + stateStr = "1" case configs.Thawed: - desired = 0 + stateStr = "0" default: - return errors.Errorf("unknown freezer state %+v", state) + return errors.Errorf("invalid freezer state %q requested", state) + } + + if err := fscommon.WriteFile(dirPath, "cgroup.freeze", stateStr); err != nil { + return err } - supportedErr := supportsFreezer(dirPath) - if supportedErr != nil && desired != 0 { - // can ignore error if desired == 1 - return errors.Wrap(supportedErr, "freezer not supported") + // Confirm that the cgroup did actually change states. + if actualState, err := getFreezer(dirPath); err != nil { + return err + } else if actualState != state { + return errors.Errorf(`expected "cgroup.freeze" to be in state %q but was in %q`, state, actualState) } - return freezeWithInt(dirPath, desired) + return nil } func supportsFreezer(dirPath string) error { @@ -36,18 +52,22 @@ func supportsFreezer(dirPath string) error { return err } -// freeze writes desired int to "cgroup.freeze". -func freezeWithInt(dirPath string, desired int) error { - desiredS := strconv.Itoa(desired) - if err := fscommon.WriteFile(dirPath, "cgroup.freeze", desiredS); err != nil { - return err - } - got, err := fscommon.ReadFile(dirPath, "cgroup.freeze") +func getFreezer(dirPath string) (configs.FreezerState, error) { + state, err := fscommon.ReadFile(dirPath, "cgroup.freeze") if err != nil { - return err + // If the kernel is too old, then we just treat the freezer as being in + // an "undefined" state. + if os.IsNotExist(err) || errors.Cause(err) == unix.ENODEV { + err = nil + } + return configs.Undefined, err } - if gotS := strings.TrimSpace(string(got)); gotS != desiredS { - return errors.Errorf("expected \"cgroup.freeze\" in %q to be %q, got %q", dirPath, desiredS, gotS) + switch strings.TrimSpace(state) { + case "0": + return configs.Thawed, nil + case "1": + return configs.Frozen, nil + default: + return configs.Undefined, errors.Errorf(`unknown "cgroup.freeze" state: %q`, state) } - return nil } diff --git a/libcontainer/cgroups/fs2/fs2.go b/libcontainer/cgroups/fs2/fs2.go index 4bb7091a1832..82d1d08430c7 100644 --- a/libcontainer/cgroups/fs2/fs2.go +++ b/libcontainer/cgroups/fs2/fs2.go @@ -212,3 +212,7 @@ func (m *manager) Set(container *configs.Config) error { func (m *manager) GetCgroups() (*configs.Cgroup, error) { return m.config, nil } + +func (m *manager) GetFreezerState() (configs.FreezerState, error) { + return getFreezer(m.dirPath) +} diff --git a/libcontainer/cgroups/systemd/apply_systemd.go b/libcontainer/cgroups/systemd/apply_systemd.go index c4b19b3e6d4f..e2f94d1b1254 100644 --- a/libcontainer/cgroups/systemd/apply_systemd.go +++ b/libcontainer/cgroups/systemd/apply_systemd.go @@ -3,7 +3,8 @@ package systemd import ( - "errors" + "bufio" + stdErrors "errors" "fmt" "io/ioutil" "math" @@ -16,8 +17,10 @@ import ( systemdDbus "github.com/coreos/go-systemd/dbus" "github.com/godbus/dbus" "github.com/opencontainers/runc/libcontainer/cgroups" + "github.com/opencontainers/runc/libcontainer/cgroups/devices" "github.com/opencontainers/runc/libcontainer/cgroups/fs" "github.com/opencontainers/runc/libcontainer/configs" + "github.com/pkg/errors" "github.com/sirupsen/logrus" ) @@ -36,7 +39,7 @@ type subsystem interface { Set(path string, cgroup *configs.Cgroup) error } -var errSubsystemDoesNotExist = errors.New("cgroup: subsystem does not exist") +var errSubsystemDoesNotExist = stdErrors.New("cgroup: subsystem does not exist") type subsystemSet []subsystem @@ -70,6 +73,209 @@ const ( testSliceWait = 4 ) +func groupPrefix(ruleType configs.DeviceType) (string, error) { + switch ruleType { + case configs.BlockDevice: + return "block-", nil + case configs.CharDevice: + return "char-", nil + default: + return "", errors.Errorf("device type %v has no group prefix", ruleType) + } +} + +// findDeviceGroup tries to find the device group name (as listed in +// /proc/devices) with the type prefixed as requried for DeviceAllow, for a +// given (type, major) combination. If more than one device group exists, an +// arbitrary one is chosen. +func findDeviceGroup(ruleType configs.DeviceType, ruleMajor int64) (string, error) { + fh, err := os.Open("/proc/devices") + if err != nil { + return "", err + } + defer fh.Close() + + prefix, err := groupPrefix(ruleType) + if err != nil { + return "", err + } + + scanner := bufio.NewScanner(fh) + var currentType configs.DeviceType + for scanner.Scan() { + // We need to strip spaces because the first number is column-aligned. + line := strings.TrimSpace(scanner.Text()) + + // Handle the "header" lines. + switch line { + case "Block devices:": + currentType = configs.BlockDevice + continue + case "Character devices:": + currentType = configs.CharDevice + continue + case "": + continue + } + + // Skip lines unrelated to our type. + if currentType != ruleType { + continue + } + + // Parse out the (major, name). + var ( + currMajor int64 + currName string + ) + if n, err := fmt.Sscanf(line, "%d %s", &currMajor, &currName); err != nil || n != 2 { + if err == nil { + err = errors.Errorf("wrong number of fields") + } + return "", errors.Wrapf(err, "scan /proc/devices line %q", line) + } + + if currMajor == ruleMajor { + return prefix + currName, nil + } + } + if err := scanner.Err(); err != nil { + return "", errors.Wrap(err, "reading /proc/devices") + } + // Couldn't find the device group. + return "", nil +} + +// generateDeviceProperties takes the configured device rules and generates a +// corresponding set of systemd properties to configure the devices correctly. +func generateDeviceProperties(rules []*configs.DeviceRule) ([]systemdDbus.Property, error) { + // DeviceAllow is the type "a(ss)" which means we need a temporary struct + // to represent it in Go. + type deviceAllowEntry struct { + Path string + Perms string + } + + properties := []systemdDbus.Property{ + // Always run in the strictest white-list mode. + newProp("DevicePolicy", "strict"), + // Empty the DevicesAllow array before filling it. + newProp("DeviceAllow", []deviceAllowEntry{}), + } + + // Figure out the set of rules. + configEmu := &devices.Emulator{} + for _, rule := range rules { + if err := configEmu.Apply(*rule); err != nil { + return nil, errors.Wrap(err, "apply rule for systemd") + } + } + // systemd doesn't support blacklists. So we log a warning, and tell + // systemd to act as a deny-all whitelist. This ruleset will be replaced + // with our normal fallback code. This may result in spurrious errors, but + // the only other option is to error out here. + if configEmu.IsBlacklist() { + // However, if we're dealing with an allow-all rule then we can do it. + if configEmu.IsAllowAll() { + return []systemdDbus.Property{ + // Run in white-list mode by setting to "auto" and removing all + // DeviceAllow rules. + newProp("DevicePolicy", "auto"), + newProp("DeviceAllow", []deviceAllowEntry{}), + }, nil + } + logrus.Warn("systemd doesn't support blacklist device rules -- applying temporary deny-all rule") + return properties, nil + } + + // Now generate the set of rules we actually need to apply. Unlike the + // normal devices cgroup, in "strict" mode systemd defaults to a deny-all + // whitelist which is the default for devices.Emulator. + baseEmu := &devices.Emulator{} + finalRules, err := baseEmu.Transition(configEmu) + if err != nil { + return nil, errors.Wrap(err, "get simplified rules for systemd") + } + var deviceAllowList []deviceAllowEntry + for _, rule := range finalRules { + if !rule.Allow { + // Should never happen. + return nil, errors.Errorf("[internal error] cannot add deny rule to systemd DeviceAllow list: %v", *rule) + } + switch rule.Type { + case configs.BlockDevice, configs.CharDevice: + default: + // Should never happen. + return nil, errors.Errorf("invalid device type for DeviceAllow: %v", rule.Type) + } + + entry := deviceAllowEntry{ + Perms: string(rule.Permissions), + } + + // systemd has a fairly odd (though understandable) syntax here, and + // because of the OCI configuration format we have to do quite a bit of + // trickery to convert things: + // + // * Concrete rules with non-wildcard major/minor numbers have to use + // /dev/{block,char} paths. This is slightly odd because it means + // that we cannot add whitelist rules for devices that don't exist, + // but there's not too much we can do about that. + // + // However, path globbing is not support for path-based rules so we + // need to handle wildcards in some other manner. + // + // * Wildcard-minor rules have to specify a "device group name" (the + // second column in /proc/devices). + // + // * Wildcard (major and minor) rules can just specify a glob with the + // type ("char-*" or "block-*"). + // + // The only type of rule we can't handle is wildcard-major rules, and + // so we'll give a warning in that case (note that the fallback code + // will insert any rules systemd couldn't handle). What amazing fun. + + if rule.Major == configs.Wildcard { + // "_ *:n _" rules aren't supported by systemd. + if rule.Minor != configs.Wildcard { + logrus.Warnf("systemd doesn't support '*:n' device rules -- temporarily ignoring rule: %v", *rule) + continue + } + + // "_ *:* _" rules just wildcard everything. + prefix, err := groupPrefix(rule.Type) + if err != nil { + return nil, err + } + entry.Path = prefix + "*" + } else if rule.Minor == configs.Wildcard { + // "_ n:* _" rules require a device group from /proc/devices. + group, err := findDeviceGroup(rule.Type, rule.Major) + if err != nil { + return nil, errors.Wrapf(err, "find device '%v/%d'", rule.Type, rule.Major) + } + if group == "" { + // Couldn't find a group. + logrus.Warnf("could not find device group for '%v/%d' in /proc/devices -- temporarily ignoring rule: %v", rule.Type, rule.Major, *rule) + continue + } + entry.Path = group + } else { + // "_ n:m _" rules are just a path in /dev/{block,char}/. + switch rule.Type { + case configs.BlockDevice: + entry.Path = fmt.Sprintf("/dev/block/%d:%d", rule.Major, rule.Minor) + case configs.CharDevice: + entry.Path = fmt.Sprintf("/dev/char/%d:%d", rule.Major, rule.Minor) + } + } + deviceAllowList = append(deviceAllowList, entry) + } + + properties = append(properties, newProp("DeviceAllow", deviceAllowList)) + return properties, nil +} + var ( connLock sync.Mutex theConn *systemdDbus.Conn @@ -196,6 +402,12 @@ func (m *LegacyManager) Apply(pid int) error { properties = append(properties, newProp("DefaultDependencies", false)) + deviceProperties, err := generateDeviceProperties(c.Resources.Devices) + if err != nil { + return err + } + properties = append(properties, deviceProperties...) + if c.Resources.Memory != 0 { properties = append(properties, newProp("MemoryLimit", uint64(c.Resources.Memory))) @@ -476,7 +688,6 @@ func (m *LegacyManager) Set(container *configs.Config) error { if err != nil && !cgroups.IsNotFound(err) { return err } - if err := sys.Set(path, container.Cgroups); err != nil { return err } @@ -532,3 +743,15 @@ func isUnitExists(err error) bool { func (m *LegacyManager) GetCgroups() (*configs.Cgroup, error) { return m.Cgroups, nil } + +func (m *LegacyManager) GetFreezerState() (configs.FreezerState, error) { + path, err := getSubsystemPath(m.Cgroups, "freezer") + if err != nil && !cgroups.IsNotFound(err) { + return configs.Undefined, err + } + freezer, err := legacySubsystems.Get("freezer") + if err != nil { + return configs.Undefined, err + } + return freezer.(*fs.FreezerGroup).GetState(path) +} diff --git a/libcontainer/cgroups/systemd/unified_hierarchy.go b/libcontainer/cgroups/systemd/unified_hierarchy.go index 6605099190ca..2bfcccd130ac 100644 --- a/libcontainer/cgroups/systemd/unified_hierarchy.go +++ b/libcontainer/cgroups/systemd/unified_hierarchy.go @@ -87,6 +87,12 @@ func (m *UnifiedManager) Apply(pid int) error { properties = append(properties, newProp("DefaultDependencies", false)) + deviceProperties, err := generateDeviceProperties(c.Resources.Devices) + if err != nil { + return err + } + properties = append(properties, deviceProperties...) + if c.Resources.Memory != 0 { properties = append(properties, newProp("MemoryLimit", uint64(c.Resources.Memory))) @@ -310,3 +316,11 @@ func (m *UnifiedManager) Set(container *configs.Config) error { func (m *UnifiedManager) GetCgroups() (*configs.Cgroup, error) { return m.Cgroups, nil } + +func (m *UnifiedManager) GetFreezerState() (configs.FreezerState, error) { + fsMgr, err := m.fsManager() + if err != nil { + return configs.Undefined, err + } + return fsMgr.GetFreezerState() +} diff --git a/libcontainer/configs/cgroup_linux.go b/libcontainer/configs/cgroup_linux.go index 58ed19c9e78e..e83bb72dc334 100644 --- a/libcontainer/configs/cgroup_linux.go +++ b/libcontainer/configs/cgroup_linux.go @@ -32,15 +32,8 @@ type Cgroup struct { } type Resources struct { - // If this is true allow access to any kind of device within the container. If false, allow access only to devices explicitly listed in the allowed_devices list. - // Deprecated - AllowAllDevices *bool `json:"allow_all_devices,omitempty"` - // Deprecated - AllowedDevices []*Device `json:"allowed_devices,omitempty"` - // Deprecated - DeniedDevices []*Device `json:"denied_devices,omitempty"` - - Devices []*Device `json:"devices"` + // Devices is the set of access rules for devices in the container. + Devices []*DeviceRule `json:"devices"` // Memory limit (in bytes) Memory int64 `json:"memory"` diff --git a/libcontainer/configs/device.go b/libcontainer/configs/device.go index 8701bb212d7b..24c5bbfa6ae8 100644 --- a/libcontainer/configs/device.go +++ b/libcontainer/configs/device.go @@ -1,8 +1,12 @@ package configs import ( + "errors" "fmt" "os" + "strconv" + + "golang.org/x/sys/unix" ) const ( @@ -12,21 +16,11 @@ const ( // TODO Windows: This can be factored out in the future type Device struct { - // Device type, block, char, etc. - Type rune `json:"type"` + DeviceRule // Path to the device. Path string `json:"path"` - // Major is the device's major number. - Major int64 `json:"major"` - - // Minor is the device's minor number. - Minor int64 `json:"minor"` - - // Cgroup permissions format, rwm. - Permissions string `json:"permissions"` - // FileMode permission bits for the device. FileMode os.FileMode `json:"file_mode"` @@ -35,23 +29,154 @@ type Device struct { // Gid of the device. Gid uint32 `json:"gid"` +} - // Write the file to the allowed list - Allow bool `json:"allow"` +// DevicePermissions is a cgroupv1-style string to represent device access. It +// has to be a string for backward compatibility reasons, hence why it has +// methods to do set operations. +type DevicePermissions string + +const ( + deviceRead uint = (1 << iota) + deviceWrite + deviceMknod +) + +func (p DevicePermissions) toSet() uint { + var set uint + for _, perm := range p { + switch perm { + case 'r': + set |= deviceRead + case 'w': + set |= deviceWrite + case 'm': + set |= deviceMknod + } + } + return set +} + +func fromSet(set uint) DevicePermissions { + var perm string + if set&deviceRead == deviceRead { + perm += "r" + } + if set&deviceWrite == deviceWrite { + perm += "w" + } + if set&deviceMknod == deviceMknod { + perm += "m" + } + return DevicePermissions(perm) +} + +// Union returns the union of the two sets of DevicePermissions. +func (p DevicePermissions) Union(o DevicePermissions) DevicePermissions { + lhs := p.toSet() + rhs := o.toSet() + return fromSet(lhs | rhs) +} + +// Difference returns the set difference of the two sets of DevicePermissions. +// In set notation, A.Difference(B) gives you A\B. +func (p DevicePermissions) Difference(o DevicePermissions) DevicePermissions { + lhs := p.toSet() + rhs := o.toSet() + return fromSet(lhs &^ rhs) +} + +// Intersection computes the intersection of the two sets of DevicePermissions. +func (p DevicePermissions) Intersection(o DevicePermissions) DevicePermissions { + lhs := p.toSet() + rhs := o.toSet() + return fromSet(lhs & rhs) +} + +// IsEmpty returns whether the set of permissions in a DevicePermissions is +// empty. +func (p DevicePermissions) IsEmpty() bool { + return p == DevicePermissions("") +} + +// IsValid returns whether the set of permissions is a subset of valid +// permissions (namely, {r,w,m}). +func (p DevicePermissions) IsValid() bool { + return p == fromSet(p.toSet()) +} + +type DeviceType rune + +const ( + WildcardDevice DeviceType = 'a' + BlockDevice DeviceType = 'b' + CharDevice DeviceType = 'c' // or 'u' + FifoDevice DeviceType = 'p' +) + +func (t DeviceType) IsValid() bool { + switch t { + case WildcardDevice, BlockDevice, CharDevice, FifoDevice: + return true + default: + return false + } } -func (d *Device) CgroupString() string { - return fmt.Sprintf("%c %s:%s %s", d.Type, deviceNumberString(d.Major), deviceNumberString(d.Minor), d.Permissions) +func (t DeviceType) CanMknod() bool { + switch t { + case BlockDevice, CharDevice, FifoDevice: + return true + default: + return false + } +} + +func (t DeviceType) CanCgroup() bool { + switch t { + case WildcardDevice, BlockDevice, CharDevice: + return true + default: + return false + } } -func (d *Device) Mkdev() int { - return int((d.Major << 8) | (d.Minor & 0xff) | ((d.Minor & 0xfff00) << 12)) +type DeviceRule struct { + // Type of device ('c' for char, 'b' for block). If set to 'a', this rule + // acts as a wildcard and all fields other than Allow are ignored. + Type DeviceType `json:"type"` + + // Major is the device's major number. + Major int64 `json:"major"` + + // Minor is the device's minor number. + Minor int64 `json:"minor"` + + // Permissions is the set of permissions that this rule applies to (in the + // cgroupv1 format -- any combination of "rwm"). + Permissions DevicePermissions `json:"permissions"` + + // Allow specifies whether this rule is allowed. + Allow bool `json:"allow"` +} + +func (d *DeviceRule) CgroupString() string { + var ( + major = strconv.FormatInt(d.Major, 10) + minor = strconv.FormatInt(d.Minor, 10) + ) + if d.Major == Wildcard { + major = "*" + } + if d.Minor == Wildcard { + minor = "*" + } + return fmt.Sprintf("%c %s:%s %s", d.Type, major, minor, d.Permissions) } -// deviceNumberString converts the device number to a string return result. -func deviceNumberString(number int64) string { - if number == Wildcard { - return "*" +func (d *DeviceRule) Mkdev() (uint64, error) { + if d.Major == Wildcard || d.Minor == Wildcard { + return 0, errors.New("cannot mkdev() device with wildcards") } - return fmt.Sprint(number) + return unix.Mkdev(uint32(d.Major), uint32(d.Minor)), nil } diff --git a/libcontainer/configs/device_defaults.go b/libcontainer/configs/device_defaults.go deleted file mode 100644 index e4f423c523ff..000000000000 --- a/libcontainer/configs/device_defaults.go +++ /dev/null @@ -1,111 +0,0 @@ -// +build linux - -package configs - -var ( - // DefaultSimpleDevices are devices that are to be both allowed and created. - DefaultSimpleDevices = []*Device{ - // /dev/null and zero - { - Path: "/dev/null", - Type: 'c', - Major: 1, - Minor: 3, - Permissions: "rwm", - FileMode: 0666, - }, - { - Path: "/dev/zero", - Type: 'c', - Major: 1, - Minor: 5, - Permissions: "rwm", - FileMode: 0666, - }, - - { - Path: "/dev/full", - Type: 'c', - Major: 1, - Minor: 7, - Permissions: "rwm", - FileMode: 0666, - }, - - // consoles and ttys - { - Path: "/dev/tty", - Type: 'c', - Major: 5, - Minor: 0, - Permissions: "rwm", - FileMode: 0666, - }, - - // /dev/urandom,/dev/random - { - Path: "/dev/urandom", - Type: 'c', - Major: 1, - Minor: 9, - Permissions: "rwm", - FileMode: 0666, - }, - { - Path: "/dev/random", - Type: 'c', - Major: 1, - Minor: 8, - Permissions: "rwm", - FileMode: 0666, - }, - } - DefaultAllowedDevices = append([]*Device{ - // allow mknod for any device - { - Type: 'c', - Major: Wildcard, - Minor: Wildcard, - Permissions: "m", - }, - { - Type: 'b', - Major: Wildcard, - Minor: Wildcard, - Permissions: "m", - }, - - { - Path: "/dev/console", - Type: 'c', - Major: 5, - Minor: 1, - Permissions: "rwm", - }, - // /dev/pts/ - pts namespaces are "coming soon" - { - Path: "", - Type: 'c', - Major: 136, - Minor: Wildcard, - Permissions: "rwm", - }, - { - Path: "", - Type: 'c', - Major: 5, - Minor: 2, - Permissions: "rwm", - }, - - // tuntap - { - Path: "", - Type: 'c', - Major: 10, - Minor: 200, - Permissions: "rwm", - }, - }, DefaultSimpleDevices...) - DefaultAutoCreatedDevices = append([]*Device{}, DefaultSimpleDevices...) -) diff --git a/libcontainer/container_linux.go b/libcontainer/container_linux.go index fe70c937aad9..e59f29588164 100644 --- a/libcontainer/container_linux.go +++ b/libcontainer/container_linux.go @@ -1812,27 +1812,11 @@ func (c *linuxContainer) runType() (Status, error) { } func (c *linuxContainer) isPaused() (bool, error) { - fcg := c.cgroupManager.GetPaths()["freezer"] - if fcg == "" { - // A container doesn't have a freezer cgroup - return false, nil - } - pausedState := "FROZEN" - filename := "freezer.state" - if cgroups.IsCgroup2UnifiedMode() { - filename = "cgroup.freeze" - pausedState = "1" - } - - data, err := ioutil.ReadFile(filepath.Join(fcg, filename)) + state, err := c.cgroupManager.GetFreezerState() if err != nil { - // If freezer cgroup is not mounted, the container would just be not paused. - if os.IsNotExist(err) || err == syscall.ENODEV { - return false, nil - } - return false, newSystemErrorWithCause(err, "checking if container is paused") + return false, err } - return bytes.Equal(bytes.TrimSpace(data), []byte(pausedState)), nil + return state == configs.Frozen, nil } func (c *linuxContainer) currentState() (*State, error) { diff --git a/libcontainer/container_linux_test.go b/libcontainer/container_linux_test.go index f8af05d75fb0..2d972212ce02 100644 --- a/libcontainer/container_linux_test.go +++ b/libcontainer/container_linux_test.go @@ -61,10 +61,15 @@ func (m *mockCgroupManager) GetUnifiedPath() (string, error) { func (m *mockCgroupManager) Freeze(state configs.FreezerState) error { return nil } + func (m *mockCgroupManager) GetCgroups() (*configs.Cgroup, error) { return nil, nil } +func (m *mockCgroupManager) GetFreezerState() (configs.FreezerState, error) { + return configs.Thawed, nil +} + func (m *mockIntelRdtManager) Apply(pid int) error { return nil } diff --git a/libcontainer/devices/devices.go b/libcontainer/devices/devices.go index 5dabe06cef5b..702f913ec942 100644 --- a/libcontainer/devices/devices.go +++ b/libcontainer/devices/devices.go @@ -31,33 +31,33 @@ func DeviceFromPath(path, permissions string) (*configs.Device, error) { } var ( + devType configs.DeviceType + mode = stat.Mode devNumber = uint64(stat.Rdev) major = unix.Major(devNumber) minor = unix.Minor(devNumber) ) - if major == 0 { - return nil, ErrNotADevice - } - - var ( - devType rune - mode = stat.Mode - ) switch { case mode&unix.S_IFBLK == unix.S_IFBLK: - devType = 'b' + devType = configs.BlockDevice case mode&unix.S_IFCHR == unix.S_IFCHR: - devType = 'c' + devType = configs.CharDevice + case mode&unix.S_IFIFO == unix.S_IFIFO: + devType = configs.FifoDevice + default: + return nil, ErrNotADevice } return &configs.Device{ - Type: devType, - Path: path, - Major: int64(major), - Minor: int64(minor), - Permissions: permissions, - FileMode: os.FileMode(mode), - Uid: stat.Uid, - Gid: stat.Gid, + DeviceRule: configs.DeviceRule{ + Type: devType, + Major: int64(major), + Minor: int64(minor), + Permissions: configs.DevicePermissions(permissions), + }, + Path: path, + FileMode: os.FileMode(mode), + Uid: stat.Uid, + Gid: stat.Gid, }, nil } diff --git a/libcontainer/integration/template_test.go b/libcontainer/integration/template_test.go index 5f7cab53b67c..30503a98bb99 100644 --- a/libcontainer/integration/template_test.go +++ b/libcontainer/integration/template_test.go @@ -2,6 +2,7 @@ package integration import ( "github.com/opencontainers/runc/libcontainer/configs" + "github.com/opencontainers/runc/libcontainer/specconv" "golang.org/x/sys/unix" ) @@ -20,7 +21,10 @@ const defaultMountFlags = unix.MS_NOEXEC | unix.MS_NOSUID | unix.MS_NODEV // it uses a network strategy of just setting a loopback interface // and the default setup for devices func newTemplateConfig(rootfs string) *configs.Config { - allowAllDevices := false + var allowedDevices []*configs.DeviceRule + for _, device := range specconv.AllowedDevices { + allowedDevices = append(allowedDevices, &device.DeviceRule) + } return &configs.Config{ Rootfs: rootfs, Capabilities: &configs.Capabilities{ @@ -116,8 +120,7 @@ func newTemplateConfig(rootfs string) *configs.Config { Path: "integration/test", Resources: &configs.Resources{ MemorySwappiness: nil, - AllowAllDevices: &allowAllDevices, - AllowedDevices: configs.DefaultAllowedDevices, + Devices: allowedDevices, }, }, MaskPaths: []string{ @@ -127,7 +130,7 @@ func newTemplateConfig(rootfs string) *configs.Config { ReadonlyPaths: []string{ "/proc/sys", "/proc/sysrq-trigger", "/proc/irq", "/proc/bus", }, - Devices: configs.DefaultAutoCreatedDevices, + Devices: specconv.AllowedDevices, Hostname: "integration", Mounts: []*configs.Mount{ { diff --git a/libcontainer/rootfs_linux.go b/libcontainer/rootfs_linux.go index 106c4c2b98bf..4285eb8acaaa 100644 --- a/libcontainer/rootfs_linux.go +++ b/libcontainer/rootfs_linux.go @@ -624,11 +624,14 @@ func bindMountDeviceNode(dest string, node *configs.Device) error { // Creates the device node in the rootfs of the container. func createDeviceNode(rootfs string, node *configs.Device, bind bool) error { + if node.Path == "" { + // The node only exists for cgroup reasons, ignore it here. + return nil + } dest := filepath.Join(rootfs, node.Path) if err := os.MkdirAll(filepath.Dir(dest), 0755); err != nil { return err } - if bind { return bindMountDeviceNode(dest, node) } @@ -646,16 +649,20 @@ func createDeviceNode(rootfs string, node *configs.Device, bind bool) error { func mknodDevice(dest string, node *configs.Device) error { fileMode := node.FileMode switch node.Type { - case 'c', 'u': - fileMode |= unix.S_IFCHR - case 'b': + case configs.BlockDevice: fileMode |= unix.S_IFBLK - case 'p': + case configs.CharDevice: + fileMode |= unix.S_IFCHR + case configs.FifoDevice: fileMode |= unix.S_IFIFO default: return fmt.Errorf("%c is not a valid device type for device %s", node.Type, node.Path) } - if err := unix.Mknod(dest, uint32(fileMode), node.Mkdev()); err != nil { + dev, err := node.Mkdev() + if err != nil { + return err + } + if err := unix.Mknod(dest, uint32(fileMode), int(dev)); err != nil { return err } return unix.Chown(dest, int(node.Uid), int(node.Gid)) diff --git a/libcontainer/specconv/spec_linux.go b/libcontainer/specconv/spec_linux.go index d9e73c46be10..69bbc7c0b889 100644 --- a/libcontainer/specconv/spec_linux.go +++ b/libcontainer/specconv/spec_linux.go @@ -43,104 +43,147 @@ var mountPropagationMapping = map[string]int{ "": 0, } -// AllowedDevices is exposed for devicefilter_test.go +// AllowedDevices is the set of devices which are automatically included for +// all containers. +// +// XXX (cyphar) +// This behaviour is at the very least "questionable" (if not outright +// wrong) according to the runtime-spec. +// +// Yes, we have to include certain devices other than the ones the user +// specifies, but several devices listed here are not part of the spec +// (including "mknod for any device"?!). In addition, these rules are +// appended to the user-provided set which means that users *cannot disable +// this behaviour*. +// +// ... unfortunately I'm too scared to change this now because who knows how +// many people depend on this (incorrect and arguably insecure) behaviour. var AllowedDevices = []*configs.Device{ // allow mknod for any device { - Type: 'c', - Major: wildcard, - Minor: wildcard, - Permissions: "m", - Allow: true, - }, - { - Type: 'b', - Major: wildcard, - Minor: wildcard, - Permissions: "m", - Allow: true, + DeviceRule: configs.DeviceRule{ + Type: configs.CharDevice, + Major: configs.Wildcard, + Minor: configs.Wildcard, + Permissions: "m", + Allow: true, + }, }, { - Type: 'c', - Path: "/dev/null", - Major: 1, - Minor: 3, - Permissions: "rwm", - Allow: true, + DeviceRule: configs.DeviceRule{ + Type: configs.BlockDevice, + Major: configs.Wildcard, + Minor: configs.Wildcard, + Permissions: "m", + Allow: true, + }, }, { - Type: 'c', - Path: "/dev/random", - Major: 1, - Minor: 8, - Permissions: "rwm", - Allow: true, + Path: "/dev/null", + FileMode: 0666, + Uid: 0, + Gid: 0, + DeviceRule: configs.DeviceRule{ + Type: configs.CharDevice, + Major: 1, + Minor: 3, + Permissions: "rwm", + Allow: true, + }, }, { - Type: 'c', - Path: "/dev/full", - Major: 1, - Minor: 7, - Permissions: "rwm", - Allow: true, + Path: "/dev/random", + FileMode: 0666, + Uid: 0, + Gid: 0, + DeviceRule: configs.DeviceRule{ + Type: configs.CharDevice, + Major: 1, + Minor: 8, + Permissions: "rwm", + Allow: true, + }, }, { - Type: 'c', - Path: "/dev/tty", - Major: 5, - Minor: 0, - Permissions: "rwm", - Allow: true, + Path: "/dev/full", + FileMode: 0666, + Uid: 0, + Gid: 0, + DeviceRule: configs.DeviceRule{ + Type: configs.CharDevice, + Major: 1, + Minor: 7, + Permissions: "rwm", + Allow: true, + }, }, { - Type: 'c', - Path: "/dev/zero", - Major: 1, - Minor: 5, - Permissions: "rwm", - Allow: true, + Path: "/dev/tty", + FileMode: 0666, + Uid: 0, + Gid: 0, + DeviceRule: configs.DeviceRule{ + Type: configs.CharDevice, + Major: 5, + Minor: 0, + Permissions: "rwm", + Allow: true, + }, }, { - Type: 'c', - Path: "/dev/urandom", - Major: 1, - Minor: 9, - Permissions: "rwm", - Allow: true, + Path: "/dev/zero", + FileMode: 0666, + Uid: 0, + Gid: 0, + DeviceRule: configs.DeviceRule{ + Type: configs.CharDevice, + Major: 1, + Minor: 5, + Permissions: "rwm", + Allow: true, + }, }, { - Path: "/dev/console", - Type: 'c', - Major: 5, - Minor: 1, - Permissions: "rwm", - Allow: true, + Path: "/dev/urandom", + FileMode: 0666, + Uid: 0, + Gid: 0, + DeviceRule: configs.DeviceRule{ + Type: configs.CharDevice, + Major: 1, + Minor: 9, + Permissions: "rwm", + Allow: true, + }, }, // /dev/pts/ - pts namespaces are "coming soon" { - Path: "", - Type: 'c', - Major: 136, - Minor: wildcard, - Permissions: "rwm", - Allow: true, + DeviceRule: configs.DeviceRule{ + Type: configs.CharDevice, + Major: 136, + Minor: configs.Wildcard, + Permissions: "rwm", + Allow: true, + }, }, { - Path: "", - Type: 'c', - Major: 5, - Minor: 2, - Permissions: "rwm", - Allow: true, + DeviceRule: configs.DeviceRule{ + Type: configs.CharDevice, + Major: 5, + Minor: 2, + Permissions: "rwm", + Allow: true, + }, }, // tuntap { - Path: "", - Type: 'c', - Major: 10, - Minor: 200, - Permissions: "rwm", - Allow: true, + DeviceRule: configs.DeviceRule{ + Type: configs.CharDevice, + Major: 10, + Minor: 200, + Permissions: "rwm", + Allow: true, + }, }, } @@ -342,251 +385,198 @@ func CreateCgroupConfig(opts *CreateOpts) (*configs.Cgroup, error) { // In rootless containers, any attempt to make cgroup changes is likely to fail. // libcontainer will validate this but ignores the error. - c.Resources.AllowedDevices = AllowedDevices if spec.Linux != nil { r := spec.Linux.Resources - if r == nil { - return c, nil - } - for i, d := range spec.Linux.Resources.Devices { - var ( - t = "a" - major = int64(-1) - minor = int64(-1) - ) - if d.Type != "" { - t = d.Type - } - if d.Major != nil { - major = *d.Major - } - if d.Minor != nil { - minor = *d.Minor - } - if d.Access == "" { - return nil, fmt.Errorf("device access at %d field cannot be empty", i) - } - dt, err := stringToCgroupDeviceRune(t) - if err != nil { - return nil, err - } - dd := &configs.Device{ - Type: dt, - Major: major, - Minor: minor, - Permissions: d.Access, - Allow: d.Allow, - } - c.Resources.Devices = append(c.Resources.Devices, dd) - } - if r.Memory != nil { - if r.Memory.Limit != nil { - c.Resources.Memory = *r.Memory.Limit - } - if r.Memory.Reservation != nil { - c.Resources.MemoryReservation = *r.Memory.Reservation - } - if r.Memory.Swap != nil { - c.Resources.MemorySwap = *r.Memory.Swap - } - if r.Memory.Kernel != nil { - c.Resources.KernelMemory = *r.Memory.Kernel - } - if r.Memory.KernelTCP != nil { - c.Resources.KernelMemoryTCP = *r.Memory.KernelTCP - } - if r.Memory.Swappiness != nil { - c.Resources.MemorySwappiness = r.Memory.Swappiness - } - if r.Memory.DisableOOMKiller != nil { - c.Resources.OomKillDisable = *r.Memory.DisableOOMKiller - } - } - if r.CPU != nil { - if r.CPU.Shares != nil { - c.Resources.CpuShares = *r.CPU.Shares - } - if r.CPU.Quota != nil { - c.Resources.CpuQuota = *r.CPU.Quota - } - if r.CPU.Period != nil { - c.Resources.CpuPeriod = *r.CPU.Period - } - if r.CPU.RealtimeRuntime != nil { - c.Resources.CpuRtRuntime = *r.CPU.RealtimeRuntime - } - if r.CPU.RealtimePeriod != nil { - c.Resources.CpuRtPeriod = *r.CPU.RealtimePeriod - } - if r.CPU.Cpus != "" { - c.Resources.CpusetCpus = r.CPU.Cpus + if r != nil { + for i, d := range spec.Linux.Resources.Devices { + var ( + t = "a" + major = int64(-1) + minor = int64(-1) + ) + if d.Type != "" { + t = d.Type + } + if d.Major != nil { + major = *d.Major + } + if d.Minor != nil { + minor = *d.Minor + } + if d.Access == "" { + return nil, fmt.Errorf("device access at %d field cannot be empty", i) + } + dt, err := stringToCgroupDeviceRune(t) + if err != nil { + return nil, err + } + c.Resources.Devices = append(c.Resources.Devices, &configs.DeviceRule{ + Type: dt, + Major: major, + Minor: minor, + Permissions: configs.DevicePermissions(d.Access), + Allow: d.Allow, + }) } - if r.CPU.Mems != "" { - c.Resources.CpusetMems = r.CPU.Mems + if r.Memory != nil { + if r.Memory.Limit != nil { + c.Resources.Memory = *r.Memory.Limit + } + if r.Memory.Reservation != nil { + c.Resources.MemoryReservation = *r.Memory.Reservation + } + if r.Memory.Swap != nil { + c.Resources.MemorySwap = *r.Memory.Swap + } + if r.Memory.Kernel != nil { + c.Resources.KernelMemory = *r.Memory.Kernel + } + if r.Memory.KernelTCP != nil { + c.Resources.KernelMemoryTCP = *r.Memory.KernelTCP + } + if r.Memory.Swappiness != nil { + c.Resources.MemorySwappiness = r.Memory.Swappiness + } + if r.Memory.DisableOOMKiller != nil { + c.Resources.OomKillDisable = *r.Memory.DisableOOMKiller + } } - } - if r.Pids != nil { - c.Resources.PidsLimit = r.Pids.Limit - } - if r.BlockIO != nil { - if r.BlockIO.Weight != nil { - c.Resources.BlkioWeight = *r.BlockIO.Weight + if r.CPU != nil { + if r.CPU.Shares != nil { + c.Resources.CpuShares = *r.CPU.Shares + } + if r.CPU.Quota != nil { + c.Resources.CpuQuota = *r.CPU.Quota + } + if r.CPU.Period != nil { + c.Resources.CpuPeriod = *r.CPU.Period + } + if r.CPU.RealtimeRuntime != nil { + c.Resources.CpuRtRuntime = *r.CPU.RealtimeRuntime + } + if r.CPU.RealtimePeriod != nil { + c.Resources.CpuRtPeriod = *r.CPU.RealtimePeriod + } + if r.CPU.Cpus != "" { + c.Resources.CpusetCpus = r.CPU.Cpus + } + if r.CPU.Mems != "" { + c.Resources.CpusetMems = r.CPU.Mems + } } - if r.BlockIO.LeafWeight != nil { - c.Resources.BlkioLeafWeight = *r.BlockIO.LeafWeight + if r.Pids != nil { + c.Resources.PidsLimit = r.Pids.Limit } - if r.BlockIO.WeightDevice != nil { - for _, wd := range r.BlockIO.WeightDevice { - var weight, leafWeight uint16 - if wd.Weight != nil { - weight = *wd.Weight - } - if wd.LeafWeight != nil { - leafWeight = *wd.LeafWeight + if r.BlockIO != nil { + if r.BlockIO.Weight != nil { + c.Resources.BlkioWeight = *r.BlockIO.Weight + } + if r.BlockIO.LeafWeight != nil { + c.Resources.BlkioLeafWeight = *r.BlockIO.LeafWeight + } + if r.BlockIO.WeightDevice != nil { + for _, wd := range r.BlockIO.WeightDevice { + var weight, leafWeight uint16 + if wd.Weight != nil { + weight = *wd.Weight + } + if wd.LeafWeight != nil { + leafWeight = *wd.LeafWeight + } + weightDevice := configs.NewWeightDevice(wd.Major, wd.Minor, weight, leafWeight) + c.Resources.BlkioWeightDevice = append(c.Resources.BlkioWeightDevice, weightDevice) } - weightDevice := configs.NewWeightDevice(wd.Major, wd.Minor, weight, leafWeight) - c.Resources.BlkioWeightDevice = append(c.Resources.BlkioWeightDevice, weightDevice) } - } - if r.BlockIO.ThrottleReadBpsDevice != nil { - for _, td := range r.BlockIO.ThrottleReadBpsDevice { - rate := td.Rate - throttleDevice := configs.NewThrottleDevice(td.Major, td.Minor, rate) - c.Resources.BlkioThrottleReadBpsDevice = append(c.Resources.BlkioThrottleReadBpsDevice, throttleDevice) + if r.BlockIO.ThrottleReadBpsDevice != nil { + for _, td := range r.BlockIO.ThrottleReadBpsDevice { + rate := td.Rate + throttleDevice := configs.NewThrottleDevice(td.Major, td.Minor, rate) + c.Resources.BlkioThrottleReadBpsDevice = append(c.Resources.BlkioThrottleReadBpsDevice, throttleDevice) + } } - } - if r.BlockIO.ThrottleWriteBpsDevice != nil { - for _, td := range r.BlockIO.ThrottleWriteBpsDevice { - rate := td.Rate - throttleDevice := configs.NewThrottleDevice(td.Major, td.Minor, rate) - c.Resources.BlkioThrottleWriteBpsDevice = append(c.Resources.BlkioThrottleWriteBpsDevice, throttleDevice) + if r.BlockIO.ThrottleWriteBpsDevice != nil { + for _, td := range r.BlockIO.ThrottleWriteBpsDevice { + rate := td.Rate + throttleDevice := configs.NewThrottleDevice(td.Major, td.Minor, rate) + c.Resources.BlkioThrottleWriteBpsDevice = append(c.Resources.BlkioThrottleWriteBpsDevice, throttleDevice) + } } - } - if r.BlockIO.ThrottleReadIOPSDevice != nil { - for _, td := range r.BlockIO.ThrottleReadIOPSDevice { - rate := td.Rate - throttleDevice := configs.NewThrottleDevice(td.Major, td.Minor, rate) - c.Resources.BlkioThrottleReadIOPSDevice = append(c.Resources.BlkioThrottleReadIOPSDevice, throttleDevice) + if r.BlockIO.ThrottleReadIOPSDevice != nil { + for _, td := range r.BlockIO.ThrottleReadIOPSDevice { + rate := td.Rate + throttleDevice := configs.NewThrottleDevice(td.Major, td.Minor, rate) + c.Resources.BlkioThrottleReadIOPSDevice = append(c.Resources.BlkioThrottleReadIOPSDevice, throttleDevice) + } } - } - if r.BlockIO.ThrottleWriteIOPSDevice != nil { - for _, td := range r.BlockIO.ThrottleWriteIOPSDevice { - rate := td.Rate - throttleDevice := configs.NewThrottleDevice(td.Major, td.Minor, rate) - c.Resources.BlkioThrottleWriteIOPSDevice = append(c.Resources.BlkioThrottleWriteIOPSDevice, throttleDevice) + if r.BlockIO.ThrottleWriteIOPSDevice != nil { + for _, td := range r.BlockIO.ThrottleWriteIOPSDevice { + rate := td.Rate + throttleDevice := configs.NewThrottleDevice(td.Major, td.Minor, rate) + c.Resources.BlkioThrottleWriteIOPSDevice = append(c.Resources.BlkioThrottleWriteIOPSDevice, throttleDevice) + } } } - } - for _, l := range r.HugepageLimits { - c.Resources.HugetlbLimit = append(c.Resources.HugetlbLimit, &configs.HugepageLimit{ - Pagesize: l.Pagesize, - Limit: l.Limit, - }) - } - if r.Network != nil { - if r.Network.ClassID != nil { - c.Resources.NetClsClassid = *r.Network.ClassID - } - for _, m := range r.Network.Priorities { - c.Resources.NetPrioIfpriomap = append(c.Resources.NetPrioIfpriomap, &configs.IfPrioMap{ - Interface: m.Name, - Priority: int64(m.Priority), + for _, l := range r.HugepageLimits { + c.Resources.HugetlbLimit = append(c.Resources.HugetlbLimit, &configs.HugepageLimit{ + Pagesize: l.Pagesize, + Limit: l.Limit, }) } + if r.Network != nil { + if r.Network.ClassID != nil { + c.Resources.NetClsClassid = *r.Network.ClassID + } + for _, m := range r.Network.Priorities { + c.Resources.NetPrioIfpriomap = append(c.Resources.NetPrioIfpriomap, &configs.IfPrioMap{ + Interface: m.Name, + Priority: int64(m.Priority), + }) + } + } } } - // append the default allowed devices to the end of the list - c.Resources.Devices = append(c.Resources.Devices, AllowedDevices...) + // Append the default allowed devices to the end of the list. + // XXX: Really this should be prefixed... + for _, device := range AllowedDevices { + c.Resources.Devices = append(c.Resources.Devices, &device.DeviceRule) + } return c, nil } -func stringToCgroupDeviceRune(s string) (rune, error) { +func stringToCgroupDeviceRune(s string) (configs.DeviceType, error) { switch s { case "a": - return 'a', nil + return configs.WildcardDevice, nil case "b": - return 'b', nil + return configs.BlockDevice, nil case "c": - return 'c', nil + return configs.CharDevice, nil default: return 0, fmt.Errorf("invalid cgroup device type %q", s) } } -func stringToDeviceRune(s string) (rune, error) { +func stringToDeviceRune(s string) (configs.DeviceType, error) { switch s { case "p": - return 'p', nil - case "u": - return 'u', nil + return configs.FifoDevice, nil + case "u", "c": + return configs.CharDevice, nil case "b": - return 'b', nil - case "c": - return 'c', nil + return configs.BlockDevice, nil default: return 0, fmt.Errorf("invalid device type %q", s) } } func createDevices(spec *specs.Spec, config *configs.Config) error { - // add whitelisted devices - config.Devices = []*configs.Device{ - { - Type: 'c', - Path: "/dev/null", - Major: 1, - Minor: 3, - FileMode: 0666, - Uid: 0, - Gid: 0, - }, - { - Type: 'c', - Path: "/dev/random", - Major: 1, - Minor: 8, - FileMode: 0666, - Uid: 0, - Gid: 0, - }, - { - Type: 'c', - Path: "/dev/full", - Major: 1, - Minor: 7, - FileMode: 0666, - Uid: 0, - Gid: 0, - }, - { - Type: 'c', - Path: "/dev/tty", - Major: 5, - Minor: 0, - FileMode: 0666, - Uid: 0, - Gid: 0, - }, - { - Type: 'c', - Path: "/dev/zero", - Major: 1, - Minor: 5, - FileMode: 0666, - Uid: 0, - Gid: 0, - }, - { - Type: 'c', - Path: "/dev/urandom", - Major: 1, - Minor: 9, - FileMode: 0666, - Uid: 0, - Gid: 0, - }, + // Add default set of devices. + for _, device := range AllowedDevices { + if device.Path != "" { + config.Devices = append(config.Devices, device) + } } - // merge in additional devices from the spec + // Merge in additional devices from the spec. if spec.Linux != nil { for _, d := range spec.Linux.Devices { var uid, gid uint32 @@ -606,10 +596,12 @@ func createDevices(spec *specs.Spec, config *configs.Config) error { filemode = *d.FileMode } device := &configs.Device{ - Type: dt, + DeviceRule: configs.DeviceRule{ + Type: dt, + Major: d.Major, + Minor: d.Minor, + }, Path: d.Path, - Major: d.Major, - Minor: d.Minor, FileMode: filemode, Uid: uid, Gid: gid, -- 2.26.2
Locations
Projects
Search
Status Monitor
Help
OpenBuildService.org
Documentation
API Documentation
Code of Conduct
Contact
Support
@OBShq
Terms
openSUSE Build Service is sponsored by
The Open Build Service is an
openSUSE project
.
Sign Up
Log In
Places
Places
All Projects
Status Monitor