| |
@@ -0,0 +1,189 @@
|
| |
+ From 567c01f6d157cf6c1f39d68e9ca62e76d7834558 Mon Sep 17 00:00:00 2001
|
| |
+ From: Tianon Gravi <admwiggin@gmail.com>
|
| |
+ Date: Thu, 9 Sep 2021 11:31:30 -0700
|
| |
+ Subject: [PATCH] seccomp: add support for "clone3" syscall in default policy
|
| |
+ MIME-Version: 1.0
|
| |
+ Content-Type: text/plain; charset=UTF-8
|
| |
+ Content-Transfer-Encoding: 8bit
|
| |
+
|
| |
+ This is a backport of 9f6b562dd12ef7b1f9e2f8e6f2ab6477790a6594, adapted to avoid the refactoring that happened in d92739713c633c155c0f3d8065c8278b1d8a44e7.
|
| |
+
|
| |
+ Original commit message is as follows:
|
| |
+
|
| |
+ > If no seccomp policy is requested, then the built-in default policy in
|
| |
+ > dockerd applies. This has no rule for "clone3" defined, nor any default
|
| |
+ > errno defined. So when runc receives the config it attempts to determine
|
| |
+ > a default errno, using logic defined in its commit:
|
| |
+ >
|
| |
+ > opencontainers/runc@7a8d716
|
| |
+ >
|
| |
+ > As explained in the above commit message, runc uses a heuristic to
|
| |
+ > decide which errno to return by default:
|
| |
+ >
|
| |
+ > [quote]
|
| |
+ > The solution applied here is to prepend a "stub" filter which returns
|
| |
+ > -ENOSYS if the requested syscall has a larger syscall number than any
|
| |
+ > syscall mentioned in the filter. The reason for this specific rule is
|
| |
+ > that syscall numbers are (roughly) allocated sequentially and thus newer
|
| |
+ > syscalls will (usually) have a larger syscall number -- thus causing our
|
| |
+ > filters to produce -ENOSYS if the filter was written before the syscall
|
| |
+ > existed.
|
| |
+ > [/quote]
|
| |
+ >
|
| |
+ > Unfortunately clone3 appears to be one of the edge cases that does not
|
| |
+ > result in use of ENOSYS, instead ending up with the historical EPERM
|
| |
+ > errno.
|
| |
+ >
|
| |
+ > Latest glibc (2.33.9000, in Fedora 35 rawhide) will attempt to use
|
| |
+ > clone3 by default. If it sees ENOSYS then it will automatically
|
| |
+ > fall back to using clone. Any other errno is treated as a fatal
|
| |
+ > error. Thus when docker seccomp policy triggers EPERM from clone3,
|
| |
+ > no fallback occurs and programs are thus unable to spawn threads.
|
| |
+ >
|
| |
+ > The clone3 syscall is much more complicated than clone, most notably its
|
| |
+ > flags are not exposed as a direct argument any more. Instead they are
|
| |
+ > hidden inside a struct. This means that seccomp filters are unable to
|
| |
+ > apply policy based on values seen in flags. Thus we can't directly
|
| |
+ > replicate the current "clone" filtering for "clone3". We can at least
|
| |
+ > ensure "clone3" returns ENOSYS errno, to trigger fallback to "clone"
|
| |
+ > at which point we can filter on flags.
|
| |
+
|
| |
+ Signed-off-by: Tianon Gravi <admwiggin@gmail.com>
|
| |
+ Co-authored-by: Daniel P. Berrangé <berrange@redhat.com>
|
| |
+ ---
|
| |
+ profiles/seccomp/default.json | 16 ++++++++++++++++
|
| |
+ profiles/seccomp/default_linux.go | 13 +++++++++++++
|
| |
+ profiles/seccomp/seccomp.go | 1 +
|
| |
+ profiles/seccomp/seccomp_linux.go | 28 ++++++++++++----------------
|
| |
+ 4 files changed, 42 insertions(+), 16 deletions(-)
|
| |
+
|
| |
+ diff --git a/profiles/seccomp/default.json b/profiles/seccomp/default.json
|
| |
+ index 4213799ddb5..ee5e04f781a 100644
|
| |
+ --- a/profiles/seccomp/default.json
|
| |
+ +++ b/profiles/seccomp/default.json
|
| |
+ @@ -591,6 +591,7 @@
|
| |
+ "names": [
|
| |
+ "bpf",
|
| |
+ "clone",
|
| |
+ + "clone3",
|
| |
+ "fanotify_init",
|
| |
+ "fsconfig",
|
| |
+ "fsmount",
|
| |
+ @@ -670,6 +671,21 @@
|
| |
+ ]
|
| |
+ }
|
| |
+ },
|
| |
+ + {
|
| |
+ + "names": [
|
| |
+ + "clone3"
|
| |
+ + ],
|
| |
+ + "action": "SCMP_ACT_ERRNO",
|
| |
+ + "errnoRet": 38,
|
| |
+ + "args": [],
|
| |
+ + "comment": "",
|
| |
+ + "includes": {},
|
| |
+ + "excludes": {
|
| |
+ + "caps": [
|
| |
+ + "CAP_SYS_ADMIN"
|
| |
+ + ]
|
| |
+ + }
|
| |
+ + },
|
| |
+ {
|
| |
+ "names": [
|
| |
+ "reboot"
|
| |
+ diff --git a/profiles/seccomp/default_linux.go b/profiles/seccomp/default_linux.go
|
| |
+ index 879eb88c64f..fb593f336f7 100644
|
| |
+ --- a/profiles/seccomp/default_linux.go
|
| |
+ +++ b/profiles/seccomp/default_linux.go
|
| |
+ @@ -42,6 +42,7 @@ func arches() []Architecture {
|
| |
+
|
| |
+ // DefaultProfile defines the allowed syscalls for the default seccomp profile.
|
| |
+ func DefaultProfile() *Seccomp {
|
| |
+ + nosys := uint(unix.ENOSYS)
|
| |
+ syscalls := []*Syscall{
|
| |
+ {
|
| |
+ Names: []string{
|
| |
+ @@ -522,6 +523,7 @@ func DefaultProfile() *Seccomp {
|
| |
+ Names: []string{
|
| |
+ "bpf",
|
| |
+ "clone",
|
| |
+ + "clone3",
|
| |
+ "fanotify_init",
|
| |
+ "fsconfig",
|
| |
+ "fsmount",
|
| |
+ @@ -587,6 +589,17 @@ func DefaultProfile() *Seccomp {
|
| |
+ Caps: []string{"CAP_SYS_ADMIN"},
|
| |
+ },
|
| |
+ },
|
| |
+ + {
|
| |
+ + Names: []string{
|
| |
+ + "clone3",
|
| |
+ + },
|
| |
+ + Action: specs.ActErrno,
|
| |
+ + ErrnoRet: &nosys,
|
| |
+ + Args: []*specs.LinuxSeccompArg{},
|
| |
+ + Excludes: Filter{
|
| |
+ + Caps: []string{"CAP_SYS_ADMIN"},
|
| |
+ + },
|
| |
+ + },
|
| |
+ {
|
| |
+ Names: []string{
|
| |
+ "reboot",
|
| |
+ diff --git a/profiles/seccomp/seccomp.go b/profiles/seccomp/seccomp.go
|
| |
+ index d2a21cddc4b..9edec72db54 100644
|
| |
+ --- a/profiles/seccomp/seccomp.go
|
| |
+ +++ b/profiles/seccomp/seccomp.go
|
| |
+ @@ -45,6 +45,7 @@ type Syscall struct {
|
| |
+ Name string `json:"name,omitempty"`
|
| |
+ Names []string `json:"names,omitempty"`
|
| |
+ Action specs.LinuxSeccompAction `json:"action"`
|
| |
+ + ErrnoRet *uint `json:"errnoRet,omitempty"`
|
| |
+ Args []*specs.LinuxSeccompArg `json:"args"`
|
| |
+ Comment string `json:"comment"`
|
| |
+ Includes Filter `json:"includes"`
|
| |
+ diff --git a/profiles/seccomp/seccomp_linux.go b/profiles/seccomp/seccomp_linux.go
|
| |
+ index 566f173acd3..e35e242cd50 100644
|
| |
+ --- a/profiles/seccomp/seccomp_linux.go
|
| |
+ +++ b/profiles/seccomp/seccomp_linux.go
|
| |
+ @@ -150,29 +150,25 @@ Loop:
|
| |
+ }
|
| |
+ }
|
| |
+
|
| |
+ + newCall := specs.LinuxSyscall{
|
| |
+ + Action: call.Action,
|
| |
+ + ErrnoRet: call.ErrnoRet,
|
| |
+ + }
|
| |
+ if call.Name != "" && len(call.Names) != 0 {
|
| |
+ return nil, errors.New("'name' and 'names' were specified in the seccomp profile, use either 'name' or 'names'")
|
| |
+ }
|
| |
+ -
|
| |
+ if call.Name != "" {
|
| |
+ - newConfig.Syscalls = append(newConfig.Syscalls, createSpecsSyscall([]string{call.Name}, call.Action, call.Args))
|
| |
+ + newCall.Names = []string{call.Name}
|
| |
+ } else {
|
| |
+ - newConfig.Syscalls = append(newConfig.Syscalls, createSpecsSyscall(call.Names, call.Action, call.Args))
|
| |
+ + newCall.Names = call.Names
|
| |
+ + }
|
| |
+ + // Loop through all the arguments of the syscall and convert them
|
| |
+ + for _, arg := range call.Args {
|
| |
+ + newCall.Args = append(newCall.Args, *arg)
|
| |
+ }
|
| |
+ - }
|
| |
+ -
|
| |
+ - return newConfig, nil
|
| |
+ -}
|
| |
+
|
| |
+ -func createSpecsSyscall(names []string, action specs.LinuxSeccompAction, args []*specs.LinuxSeccompArg) specs.LinuxSyscall {
|
| |
+ - newCall := specs.LinuxSyscall{
|
| |
+ - Names: names,
|
| |
+ - Action: action,
|
| |
+ + newConfig.Syscalls = append(newConfig.Syscalls, newCall)
|
| |
+ }
|
| |
+
|
| |
+ - // Loop through all the arguments of the syscall and convert them
|
| |
+ - for _, arg := range args {
|
| |
+ - newCall.Args = append(newCall.Args, *arg)
|
| |
+ - }
|
| |
+ - return newCall
|
| |
+ + return newConfig, nil
|
| |
+ }
|
| |
Hi @olem,
In this PR, I updated Moby to the latest version and applied a patch to fix the clone3() issue. I changed the `commit_moby` and `commit_cli` variables accordingly. Please see my comments in the specfile and the linked Bugzilla tickets for more information about the clone3() issue. I checked the virtual provides in the `# Bundled dependencies` section, but there was nothing to change, as the `vendor.conf` files have not changed since the last release.

I think you have to run `fedpkg new-sources` before pushing the update; I can't, because I'm not a member of the packager group. That is also why the CI job fails.

This change is also relevant for `f35` and `f34`. Can you please apply it to those branches, as well? I already rebased this patch onto the `f35` and `f34` branches on my fork, so I can create PRs for those branches if that would be helpful.

Thanks,
Maxwell