Bugzilla: 1129669 Upstream-status: 3.17 and CC'd to stable From a6138db815df5ee542d848318e5dae681590fccd Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Mon, 28 Jul 2014 16:26:53 -0700 Subject: [PATCH 1/5] mnt: Only change user settable mount flags in remount Kenton Varda discovered that by remounting a read-only bind mount read-only in a user namespace the MNT_LOCK_READONLY bit would be cleared, allowing an unprivileged user to the remount a read-only mount read-write. Correct this by replacing the mask of mount flags to preserve with a mask of mount flags that may be changed, and preserve all others. This ensures that any future bugs with this mask and remount will fail in an easy to detect way where new mount flags simply won't change. Cc: stable@vger.kernel.org Acked-by: Serge E. Hallyn Signed-off-by: "Eric W. Biederman" --- fs/namespace.c | 2 +- include/linux/mount.h | 4 +++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/fs/namespace.c b/fs/namespace.c index 7187d01329c3..cb40449ea0df 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -1937,7 +1937,7 @@ static int do_remount(struct path *path, int flags, int mnt_flags, err = do_remount_sb(sb, flags, data, 0); if (!err) { lock_mount_hash(); - mnt_flags |= mnt->mnt.mnt_flags & MNT_PROPAGATION_MASK; + mnt_flags |= mnt->mnt.mnt_flags & ~MNT_USER_SETTABLE_MASK; mnt->mnt.mnt_flags = mnt_flags; touch_mnt_namespace(mnt->mnt_ns); unlock_mount_hash(); diff --git a/include/linux/mount.h b/include/linux/mount.h index 839bac270904..b637a89e1fae 100644 --- a/include/linux/mount.h +++ b/include/linux/mount.h @@ -42,7 +42,9 @@ struct mnt_namespace; * flag, consider how it interacts with shared mounts. */ #define MNT_SHARED_MASK (MNT_UNBINDABLE) -#define MNT_PROPAGATION_MASK (MNT_SHARED | MNT_UNBINDABLE) +#define MNT_USER_SETTABLE_MASK (MNT_NOSUID | MNT_NODEV | MNT_NOEXEC \ + | MNT_NOATIME | MNT_NODIRATIME | MNT_RELATIME \ + | MNT_READONLY) #define MNT_INTERNAL_FLAGS (MNT_SHARED | MNT_WRITE_HOLD | MNT_INTERNAL | \ MNT_DOOMED | MNT_SYNC_UMOUNT | MNT_MARKED) -- 2.0.4 From 07b645589dcda8b7a5249e096fece2a67556f0f4 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Mon, 28 Jul 2014 17:10:56 -0700 Subject: [PATCH 2/5] mnt: Move the test for MNT_LOCK_READONLY from change_mount_flags into do_remount There are no races as locked mount flags are guaranteed to never change. Moving the test into do_remount makes it more visible, and ensures all filesystem remounts pass the MNT_LOCK_READONLY permission check. This second case is not an issue today as filesystem remounts are guarded by capable(CAP_DAC_ADMIN) and thus will always fail in less privileged mount namespaces, but it could become an issue in the future. Cc: stable@vger.kernel.org Acked-by: Serge E. Hallyn Signed-off-by: "Eric W. Biederman" --- fs/namespace.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/fs/namespace.c b/fs/namespace.c index cb40449ea0df..1105a577a14f 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -1896,9 +1896,6 @@ static int change_mount_flags(struct vfsmount *mnt, int ms_flags) if (readonly_request == __mnt_is_readonly(mnt)) return 0; - if (mnt->mnt_flags & MNT_LOCK_READONLY) - return -EPERM; - if (readonly_request) error = mnt_make_readonly(real_mount(mnt)); else @@ -1924,6 +1921,16 @@ static int do_remount(struct path *path, int flags, int mnt_flags, if (path->dentry != path->mnt->mnt_root) return -EINVAL; + /* Don't allow changing of locked mnt flags. + * + * No locks need to be held here while testing the various + * MNT_LOCK flags because those flags can never be cleared + * once they are set. + */ + if ((mnt->mnt.mnt_flags & MNT_LOCK_READONLY) && + !(mnt_flags & MNT_READONLY)) { + return -EPERM; + } err = security_sb_remount(sb, data); if (err) return err; -- 2.0.4 From 9566d6742852c527bf5af38af5cbb878dad75705 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Mon, 28 Jul 2014 17:26:07 -0700 Subject: [PATCH 3/5] mnt: Correct permission checks in do_remount While invesgiating the issue where in "mount --bind -oremount,ro ..." would result in later "mount --bind -oremount,rw" succeeding even if the mount started off locked I realized that there are several additional mount flags that should be locked and are not. In particular MNT_NOSUID, MNT_NODEV, MNT_NOEXEC, and the atime flags in addition to MNT_READONLY should all be locked. These flags are all per superblock, can all be changed with MS_BIND, and should not be changable if set by a more privileged user. The following additions to the current logic are added in this patch. - nosuid may not be clearable by a less privileged user. - nodev may not be clearable by a less privielged user. - noexec may not be clearable by a less privileged user. - atime flags may not be changeable by a less privileged user. The logic with atime is that always setting atime on access is a global policy and backup software and auditing software could break if atime bits are not updated (when they are configured to be updated), and serious performance degradation could result (DOS attack) if atime updates happen when they have been explicitly disabled. Therefore an unprivileged user should not be able to mess with the atime bits set by a more privileged user. The additional restrictions are implemented with the addition of MNT_LOCK_NOSUID, MNT_LOCK_NODEV, MNT_LOCK_NOEXEC, and MNT_LOCK_ATIME mnt flags. Taken together these changes and the fixes for MNT_LOCK_READONLY should make it safe for an unprivileged user to create a user namespace and to call "mount --bind -o remount,... ..." without the danger of mount flags being changed maliciously. Cc: stable@vger.kernel.org Acked-by: Serge E. Hallyn Signed-off-by: "Eric W. Biederman" --- fs/namespace.c | 36 +++++++++++++++++++++++++++++++++--- include/linux/mount.h | 5 +++++ 2 files changed, 38 insertions(+), 3 deletions(-) diff --git a/fs/namespace.c b/fs/namespace.c index 1105a577a14f..dd9c93b5a9d5 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -890,8 +890,21 @@ static struct mount *clone_mnt(struct mount *old, struct dentry *root, mnt->mnt.mnt_flags = old->mnt.mnt_flags & ~(MNT_WRITE_HOLD|MNT_MARKED); /* Don't allow unprivileged users to change mount flags */ - if ((flag & CL_UNPRIVILEGED) && (mnt->mnt.mnt_flags & MNT_READONLY)) - mnt->mnt.mnt_flags |= MNT_LOCK_READONLY; + if (flag & CL_UNPRIVILEGED) { + mnt->mnt.mnt_flags |= MNT_LOCK_ATIME; + + if (mnt->mnt.mnt_flags & MNT_READONLY) + mnt->mnt.mnt_flags |= MNT_LOCK_READONLY; + + if (mnt->mnt.mnt_flags & MNT_NODEV) + mnt->mnt.mnt_flags |= MNT_LOCK_NODEV; + + if (mnt->mnt.mnt_flags & MNT_NOSUID) + mnt->mnt.mnt_flags |= MNT_LOCK_NOSUID; + + if (mnt->mnt.mnt_flags & MNT_NOEXEC) + mnt->mnt.mnt_flags |= MNT_LOCK_NOEXEC; + } /* Don't allow unprivileged users to reveal what is under a mount */ if ((flag & CL_UNPRIVILEGED) && list_empty(&old->mnt_expire)) @@ -1931,6 +1944,23 @@ static int do_remount(struct path *path, int flags, int mnt_flags, !(mnt_flags & MNT_READONLY)) { return -EPERM; } + if ((mnt->mnt.mnt_flags & MNT_LOCK_NODEV) && + !(mnt_flags & MNT_NODEV)) { + return -EPERM; + } + if ((mnt->mnt.mnt_flags & MNT_LOCK_NOSUID) && + !(mnt_flags & MNT_NOSUID)) { + return -EPERM; + } + if ((mnt->mnt.mnt_flags & MNT_LOCK_NOEXEC) && + !(mnt_flags & MNT_NOEXEC)) { + return -EPERM; + } + if ((mnt->mnt.mnt_flags & MNT_LOCK_ATIME) && + ((mnt->mnt.mnt_flags & MNT_ATIME_MASK) != (mnt_flags & MNT_ATIME_MASK))) { + return -EPERM; + } + err = security_sb_remount(sb, data); if (err) return err; @@ -2129,7 +2159,7 @@ static int do_new_mount(struct path *path, const char *fstype, int flags, */ if (!(type->fs_flags & FS_USERNS_DEV_MOUNT)) { flags |= MS_NODEV; - mnt_flags |= MNT_NODEV; + mnt_flags |= MNT_NODEV | MNT_LOCK_NODEV; } } diff --git a/include/linux/mount.h b/include/linux/mount.h index b637a89e1fae..b0c1e6574e7f 100644 --- a/include/linux/mount.h +++ b/include/linux/mount.h @@ -45,12 +45,17 @@ struct mnt_namespace; #define MNT_USER_SETTABLE_MASK (MNT_NOSUID | MNT_NODEV | MNT_NOEXEC \ | MNT_NOATIME | MNT_NODIRATIME | MNT_RELATIME \ | MNT_READONLY) +#define MNT_ATIME_MASK (MNT_NOATIME | MNT_NODIRATIME | MNT_RELATIME ) #define MNT_INTERNAL_FLAGS (MNT_SHARED | MNT_WRITE_HOLD | MNT_INTERNAL | \ MNT_DOOMED | MNT_SYNC_UMOUNT | MNT_MARKED) #define MNT_INTERNAL 0x4000 +#define MNT_LOCK_ATIME 0x040000 +#define MNT_LOCK_NOEXEC 0x080000 +#define MNT_LOCK_NOSUID 0x100000 +#define MNT_LOCK_NODEV 0x200000 #define MNT_LOCK_READONLY 0x400000 #define MNT_LOCKED 0x800000 #define MNT_DOOMED 0x1000000 -- 2.0.4 From ffbc6f0ead47fa5a1dc9642b0331cb75c20a640e Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Mon, 28 Jul 2014 17:36:04 -0700 Subject: [PATCH 4/5] mnt: Change the default remount atime from relatime to the existing value Since March 2009 the kernel has treated the state that if no MS_..ATIME flags are passed then the kernel defaults to relatime. Defaulting to relatime instead of the existing atime state during a remount is silly, and causes problems in practice for people who don't specify any MS_...ATIME flags and to get the default filesystem atime setting. Those users may encounter a permission error because the default atime setting does not work. A default that does not work and causes permission problems is ridiculous, so preserve the existing value to have a default atime setting that is always guaranteed to work. Using the default atime setting in this way is particularly interesting for applications built to run in restricted userspace environments without /proc mounted, as the existing atime mount options of a filesystem can not be read from /proc/mounts. In practice this fixes user space that uses the default atime setting on remount that are broken by the permission checks keeping less privileged users from changing more privileged users atime settings. Cc: stable@vger.kernel.org Acked-by: Serge E. Hallyn Signed-off-by: "Eric W. Biederman" --- fs/namespace.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/fs/namespace.c b/fs/namespace.c index dd9c93b5a9d5..7886176232c1 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -2473,6 +2473,14 @@ long do_mount(const char *dev_name, const char *dir_name, if (flags & MS_RDONLY) mnt_flags |= MNT_READONLY; + /* The default atime for remount is preservation */ + if ((flags & MS_REMOUNT) && + ((flags & (MS_NOATIME | MS_NODIRATIME | MS_RELATIME | + MS_STRICTATIME)) == 0)) { + mnt_flags &= ~MNT_ATIME_MASK; + mnt_flags |= path.mnt->mnt_flags & MNT_ATIME_MASK; + } + flags &= ~(MS_NOSUID | MS_NOEXEC | MS_NODEV | MS_ACTIVE | MS_BORN | MS_NOATIME | MS_NODIRATIME | MS_RELATIME| MS_KERNMOUNT | MS_STRICTATIME); -- 2.0.4 From db181ce011e3c033328608299cd6fac06ea50130 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Tue, 29 Jul 2014 15:50:44 -0700 Subject: [PATCH 5/5] mnt: Add tests for unprivileged remount cases that have found to be faulty Kenton Varda discovered that by remounting a read-only bind mount read-only in a user namespace the MNT_LOCK_READONLY bit would be cleared, allowing an unprivileged user to the remount a read-only mount read-write. Upon review of the code in remount it was discovered that the code allowed nosuid, noexec, and nodev to be cleared. It was also discovered that the code was allowing the per mount atime flags to be changed. The first naive patch to fix these issues contained the flaw that using default atime settings when remounting a filesystem could be disallowed. To avoid this problems in the future add tests to ensure unprivileged remounts are succeeding and failing at the appropriate times. Cc: stable@vger.kernel.org Acked-by: Serge E. Hallyn Signed-off-by: "Eric W. Biederman" --- tools/testing/selftests/Makefile | 1 + tools/testing/selftests/mount/Makefile | 17 ++ .../selftests/mount/unprivileged-remount-test.c | 242 +++++++++++++++++++++ 3 files changed, 260 insertions(+) create mode 100644 tools/testing/selftests/mount/Makefile create mode 100644 tools/testing/selftests/mount/unprivileged-remount-test.c diff --git a/tools/testing/selftests/Makefile b/tools/testing/selftests/Makefile index e66e710cc595..0a8a9db43d34 100644 --- a/tools/testing/selftests/Makefile +++ b/tools/testing/selftests/Makefile @@ -4,6 +4,7 @@ TARGETS += efivarfs TARGETS += kcmp TARGETS += memory-hotplug TARGETS += mqueue +TARGETS += mount TARGETS += net TARGETS += ptrace TARGETS += timers diff --git a/tools/testing/selftests/mount/Makefile b/tools/testing/selftests/mount/Makefile new file mode 100644 index 000000000000..337d853c2b72 --- /dev/null +++ b/tools/testing/selftests/mount/Makefile @@ -0,0 +1,17 @@ +# Makefile for mount selftests. + +all: unprivileged-remount-test + +unprivileged-remount-test: unprivileged-remount-test.c + gcc -Wall -O2 unprivileged-remount-test.c -o unprivileged-remount-test + +# Allow specific tests to be selected. +test_unprivileged_remount: unprivileged-remount-test + @if [ -f /proc/self/uid_map ] ; then ./unprivileged-remount-test ; fi + +run_tests: all test_unprivileged_remount + +clean: + rm -f unprivileged-remount-test + +.PHONY: all test_unprivileged_remount diff --git a/tools/testing/selftests/mount/unprivileged-remount-test.c b/tools/testing/selftests/mount/unprivileged-remount-test.c new file mode 100644 index 000000000000..1b3ff2fda4d0 --- /dev/null +++ b/tools/testing/selftests/mount/unprivileged-remount-test.c @@ -0,0 +1,242 @@ +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifndef CLONE_NEWNS +# define CLONE_NEWNS 0x00020000 +#endif +#ifndef CLONE_NEWUTS +# define CLONE_NEWUTS 0x04000000 +#endif +#ifndef CLONE_NEWIPC +# define CLONE_NEWIPC 0x08000000 +#endif +#ifndef CLONE_NEWNET +# define CLONE_NEWNET 0x40000000 +#endif +#ifndef CLONE_NEWUSER +# define CLONE_NEWUSER 0x10000000 +#endif +#ifndef CLONE_NEWPID +# define CLONE_NEWPID 0x20000000 +#endif + +#ifndef MS_RELATIME +#define MS_RELATIME (1 << 21) +#endif +#ifndef MS_STRICTATIME +#define MS_STRICTATIME (1 << 24) +#endif + +static void die(char *fmt, ...) +{ + va_list ap; + va_start(ap, fmt); + vfprintf(stderr, fmt, ap); + va_end(ap); + exit(EXIT_FAILURE); +} + +static void write_file(char *filename, char *fmt, ...) +{ + char buf[4096]; + int fd; + ssize_t written; + int buf_len; + va_list ap; + + va_start(ap, fmt); + buf_len = vsnprintf(buf, sizeof(buf), fmt, ap); + va_end(ap); + if (buf_len < 0) { + die("vsnprintf failed: %s\n", + strerror(errno)); + } + if (buf_len >= sizeof(buf)) { + die("vsnprintf output truncated\n"); + } + + fd = open(filename, O_WRONLY); + if (fd < 0) { + die("open of %s failed: %s\n", + filename, strerror(errno)); + } + written = write(fd, buf, buf_len); + if (written != buf_len) { + if (written >= 0) { + die("short write to %s\n", filename); + } else { + die("write to %s failed: %s\n", + filename, strerror(errno)); + } + } + if (close(fd) != 0) { + die("close of %s failed: %s\n", + filename, strerror(errno)); + } +} + +static void create_and_enter_userns(void) +{ + uid_t uid; + gid_t gid; + + uid = getuid(); + gid = getgid(); + + if (unshare(CLONE_NEWUSER) !=0) { + die("unshare(CLONE_NEWUSER) failed: %s\n", + strerror(errno)); + } + + write_file("/proc/self/uid_map", "0 %d 1", uid); + write_file("/proc/self/gid_map", "0 %d 1", gid); + + if (setgroups(0, NULL) != 0) { + die("setgroups failed: %s\n", + strerror(errno)); + } + if (setgid(0) != 0) { + die ("setgid(0) failed %s\n", + strerror(errno)); + } + if (setuid(0) != 0) { + die("setuid(0) failed %s\n", + strerror(errno)); + } +} + +static +bool test_unpriv_remount(int mount_flags, int remount_flags, int invalid_flags) +{ + pid_t child; + + child = fork(); + if (child == -1) { + die("fork failed: %s\n", + strerror(errno)); + } + if (child != 0) { /* parent */ + pid_t pid; + int status; + pid = waitpid(child, &status, 0); + if (pid == -1) { + die("waitpid failed: %s\n", + strerror(errno)); + } + if (pid != child) { + die("waited for %d got %d\n", + child, pid); + } + if (!WIFEXITED(status)) { + die("child did not terminate cleanly\n"); + } + return WEXITSTATUS(status) == EXIT_SUCCESS ? true : false; + } + + create_and_enter_userns(); + if (unshare(CLONE_NEWNS) != 0) { + die("unshare(CLONE_NEWNS) failed: %s\n", + strerror(errno)); + } + + if (mount("testing", "/tmp", "ramfs", mount_flags, NULL) != 0) { + die("mount of /tmp failed: %s\n", + strerror(errno)); + } + + create_and_enter_userns(); + + if (unshare(CLONE_NEWNS) != 0) { + die("unshare(CLONE_NEWNS) failed: %s\n", + strerror(errno)); + } + + if (mount("/tmp", "/tmp", "none", + MS_REMOUNT | MS_BIND | remount_flags, NULL) != 0) { + /* system("cat /proc/self/mounts"); */ + die("remount of /tmp failed: %s\n", + strerror(errno)); + } + + if (mount("/tmp", "/tmp", "none", + MS_REMOUNT | MS_BIND | invalid_flags, NULL) == 0) { + /* system("cat /proc/self/mounts"); */ + die("remount of /tmp with invalid flags " + "succeeded unexpectedly\n"); + } + exit(EXIT_SUCCESS); +} + +static bool test_unpriv_remount_simple(int mount_flags) +{ + return test_unpriv_remount(mount_flags, mount_flags, 0); +} + +static bool test_unpriv_remount_atime(int mount_flags, int invalid_flags) +{ + return test_unpriv_remount(mount_flags, mount_flags, invalid_flags); +} + +int main(int argc, char **argv) +{ + if (!test_unpriv_remount_simple(MS_RDONLY|MS_NODEV)) { + die("MS_RDONLY malfunctions\n"); + } + if (!test_unpriv_remount_simple(MS_NODEV)) { + die("MS_NODEV malfunctions\n"); + } + if (!test_unpriv_remount_simple(MS_NOSUID|MS_NODEV)) { + die("MS_NOSUID malfunctions\n"); + } + if (!test_unpriv_remount_simple(MS_NOEXEC|MS_NODEV)) { + die("MS_NOEXEC malfunctions\n"); + } + if (!test_unpriv_remount_atime(MS_RELATIME|MS_NODEV, + MS_NOATIME|MS_NODEV)) + { + die("MS_RELATIME malfunctions\n"); + } + if (!test_unpriv_remount_atime(MS_STRICTATIME|MS_NODEV, + MS_NOATIME|MS_NODEV)) + { + die("MS_STRICTATIME malfunctions\n"); + } + if (!test_unpriv_remount_atime(MS_NOATIME|MS_NODEV, + MS_STRICTATIME|MS_NODEV)) + { + die("MS_RELATIME malfunctions\n"); + } + if (!test_unpriv_remount_atime(MS_RELATIME|MS_NODIRATIME|MS_NODEV, + MS_NOATIME|MS_NODEV)) + { + die("MS_RELATIME malfunctions\n"); + } + if (!test_unpriv_remount_atime(MS_STRICTATIME|MS_NODIRATIME|MS_NODEV, + MS_NOATIME|MS_NODEV)) + { + die("MS_RELATIME malfunctions\n"); + } + if (!test_unpriv_remount_atime(MS_NOATIME|MS_NODIRATIME|MS_NODEV, + MS_STRICTATIME|MS_NODEV)) + { + die("MS_RELATIME malfunctions\n"); + } + if (!test_unpriv_remount(MS_STRICTATIME|MS_NODEV, MS_NODEV, + MS_NOATIME|MS_NODEV)) + { + die("Default atime malfunctions\n"); + } + return EXIT_SUCCESS; +} -- 2.0.4