Blob Blame History Raw
From 2450dd660e2c3483e232176a09b0229851522807 Mon Sep 17 00:00:00 2001
From: Lennart Poettering <lennart@poettering.net>
Date: Mon, 13 Aug 2012 15:27:04 +0200
Subject: [PATCH] namespace: rework namespace support

- don't use pivot_root() anymore, just reuse root hierarchy
- first create all mounts, then mark them read-only so that we get the
  right behaviour when people want writable mounts inside of
  read-only mounts
- don't pass invalid combinations of MS_ constants to the kernel
(cherry picked from commit ac0930c892bc7979b4c9bc2a52e5e844650b025d)

Conflicts:
	src/core/execute.c
---
 man/systemd.exec.xml   |  19 ++---
 src/core/execute.c     |   3 +-
 src/core/namespace.c   | 218 +++++++++++++++++++++----------------------------
 src/core/switch-root.c |   6 ++
 src/shared/util.c      |   3 +-
 src/test/test-ns.c     |   5 +-
 6 files changed, 113 insertions(+), 141 deletions(-)

diff --git a/man/systemd.exec.xml b/man/systemd.exec.xml
index 09e9f0f..bd29ddf 100644
--- a/man/systemd.exec.xml
+++ b/man/systemd.exec.xml
@@ -1039,20 +1039,13 @@
                                 <option>shared</option>,
                                 <option>slave</option> or
                                 <option>private</option>, which
-                                control whether namespaces set up with
-                                <varname>ReadWriteDirectories=</varname>,
-                                <varname>ReadOnlyDirectories=</varname>
-                                and
-                                <varname>InaccessibleDirectories=</varname>
-                                receive or propagate new mounts
-                                from/to the main namespace. See
+                                control whether the file system
+                                namespace set up for this unit's
+                                processes will receive or propagate
+                                new mounts. See
                                 <citerefentry><refentrytitle>mount</refentrytitle><manvolnum>1</manvolnum></citerefentry>
-                                for details. Defaults to
-                                <option>shared</option>, i.e. the new
-                                namespace will both receive new mount
-                                points from the main namespace as well
-                                as propagate new mounts to
-                                it.</para></listitem>
+                                for details. Default to
+                                <option>shared</option>.</para></listitem>
                         </varlistentry>
 
                         <varlistentry>
diff --git a/src/core/execute.c b/src/core/execute.c
index 8a04fe6..931a9b0 100644
--- a/src/core/execute.c
+++ b/src/core/execute.c
@@ -1246,7 +1246,7 @@ int exec_spawn(ExecCommand *command,
                 if (strv_length(context->read_write_dirs) > 0 ||
                     strv_length(context->read_only_dirs) > 0 ||
                     strv_length(context->inaccessible_dirs) > 0 ||
-                    context->mount_flags != MS_SHARED ||
+                    context->mount_flags != 0 ||
                     context->private_tmp) {
                         err = setup_namespace(context->read_write_dirs,
                                               context->read_only_dirs,
@@ -1467,7 +1467,6 @@ void exec_context_init(ExecContext *c) {
         c->cpu_sched_policy = SCHED_OTHER;
         c->syslog_priority = LOG_DAEMON|LOG_INFO;
         c->syslog_level_prefix = true;
-        c->mount_flags = MS_SHARED;
         c->kill_signal = SIGTERM;
         c->send_sigkill = true;
         c->control_group_persistent = -1;
diff --git a/src/core/namespace.c b/src/core/namespace.c
index 0148f5f..d163762 100644
--- a/src/core/namespace.c
+++ b/src/core/namespace.c
@@ -41,13 +41,15 @@ typedef enum PathMode {
         /* This is ordered by priority! */
         INACCESSIBLE,
         READONLY,
-        PRIVATE,
+        PRIVATE_TMP,
+        PRIVATE_VAR_TMP,
         READWRITE
 } PathMode;
 
 typedef struct Path {
         const char *path;
         PathMode mode;
+        bool done;
 } Path;
 
 static int append_paths(Path **p, char **strv, PathMode mode) {
@@ -91,25 +93,22 @@ static int path_compare(const void *a, const void *b) {
         return 0;
 }
 
-static void drop_duplicates(Path *p, unsigned *n, bool *need_inaccessible, bool *need_private) {
+static void drop_duplicates(Path *p, unsigned *n, bool *need_inaccessible) {
         Path *f, *t, *previous;
 
         assert(p);
         assert(n);
         assert(need_inaccessible);
-        assert(need_private);
 
         for (f = p, t = p, previous = NULL; f < p+*n; f++) {
 
+                /* The first one wins */
                 if (previous && path_equal(f->path, previous->path))
                         continue;
 
                 t->path = f->path;
                 t->mode = f->mode;
 
-                if (t->mode == PRIVATE)
-                        *need_private = true;
-
                 if (t->mode == INACCESSIBLE)
                         *need_inaccessible = true;
 
@@ -121,65 +120,62 @@ static void drop_duplicates(Path *p, unsigned *n, bool *need_inaccessible, bool
         *n = t - p;
 }
 
-static int apply_mount(Path *p, const char *root_dir, const char *inaccessible_dir, const char *private_dir, unsigned long flags) {
+static int apply_mount(
+                Path *p,
+                const char *tmp_dir,
+                const char *var_tmp_dir,
+                const char *inaccessible_dir) {
+
         const char *what;
-        char *where;
         int r;
 
         assert(p);
-        assert(root_dir);
-        assert(inaccessible_dir);
-        assert(private_dir);
-
-        where = strappend(root_dir, p->path);
-        if (!where)
-                return -ENOMEM;
 
         switch (p->mode) {
 
         case INACCESSIBLE:
                 what = inaccessible_dir;
-                flags |= MS_RDONLY;
                 break;
 
         case READONLY:
-                flags |= MS_RDONLY;
-                /* Fall through */
-
         case READWRITE:
                 what = p->path;
                 break;
 
-        case PRIVATE:
-                what = private_dir;
+        case PRIVATE_TMP:
+                what = tmp_dir;
+                break;
+
+        case PRIVATE_VAR_TMP:
+                what = var_tmp_dir;
                 break;
 
         default:
                 assert_not_reached("Unknown mode");
         }
 
-        r = mount(what, where, NULL, MS_BIND|MS_REC, NULL);
-        if (r >= 0) {
-                log_debug("Successfully mounted %s to %s", what, where);
+        assert(what);
 
-                /* The bind mount will always inherit the original
-                 * flags. If we want to set any flag we need
-                 * to do so in a second independent step. */
-                if (flags)
-                        r = mount(NULL, where, NULL, MS_REMOUNT|MS_BIND|MS_REC|flags, NULL);
+        r = mount(what, p->path, NULL, MS_BIND, NULL);
+        if (r >= 0)
+                log_debug("Successfully mounted %s to %s", what, p->path);
 
-                /* Avoid exponential growth of trees */
-                if (r >= 0 && path_equal(p->path, "/"))
-                        r = mount(NULL, where, NULL, MS_REMOUNT|MS_BIND|flags, NULL);
+        return r;
+}
 
-                if (r < 0) {
-                        r = -errno;
-                        umount2(where, MNT_DETACH);
-                }
-        }
+static int make_read_only(Path *p) {
+        int r;
 
-        free(where);
-        return r;
+        assert(p);
+
+        if (p->mode != INACCESSIBLE && p->mode != READONLY)
+                return 0;
+
+        r = mount(NULL, p->path, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY, NULL);
+        if (r < 0)
+                return -errno;
+
+        return 0;
 }
 
 int setup_namespace(
@@ -190,30 +186,26 @@ int setup_namespace(
                 unsigned long flags) {
 
         char
-                tmp_dir[] = "/tmp/systemd-namespace-XXXXXX",
-                root_dir[] = "/tmp/systemd-namespace-XXXXXX/root",
-                old_root_dir[] = "/tmp/systemd-namespace-XXXXXX/root/tmp/old-root-XXXXXX",
-                inaccessible_dir[] = "/tmp/systemd-namespace-XXXXXX/inaccessible",
-                private_dir[] = "/tmp/systemd-namespace-XXXXXX/private";
+                tmp_dir[] = "/tmp/systemd-private-XXXXXX",
+                var_tmp_dir[] = "/var/tmp/systemd-private-XXXXXX",
+                inaccessible_dir[] = "/tmp/systemd-inaccessible-XXXXXX";
 
         Path *paths, *p;
         unsigned n;
-        bool need_private = false, need_inaccessible = false;
-        bool remove_tmp = false, remove_root = false, remove_old_root = false, remove_inaccessible = false, remove_private = false;
+        bool need_inaccessible = false;
+        bool remove_tmp = false, remove_var_tmp = false, remove_inaccessible = false;
         int r;
-        const char *t;
+
+        if (!flags)
+                flags = MS_SHARED;
 
         n =
                 strv_length(writable) +
                 strv_length(readable) +
                 strv_length(inaccessible) +
-                (private_tmp ? 3 : 1);
-
-        paths = new(Path, n);
-        if (!paths)
-                return -ENOMEM;
+                (private_tmp ? 2 : 0);
 
-        p = paths;
+        p = paths = alloca(sizeof(Path) * n);
         if ((r = append_paths(&p, writable, READWRITE)) < 0 ||
             (r = append_paths(&p, readable, READONLY)) < 0 ||
             (r = append_paths(&p, inaccessible, INACCESSIBLE)) < 0)
@@ -221,60 +213,70 @@ int setup_namespace(
 
         if (private_tmp) {
                 p->path = "/tmp";
-                p->mode = PRIVATE;
+                p->mode = PRIVATE_TMP;
                 p++;
 
                 p->path = "/var/tmp";
-                p->mode = PRIVATE;
+                p->mode = PRIVATE_VAR_TMP;
                 p++;
         }
 
-        p->path = "/";
-        p->mode = READWRITE;
-        p++;
-
         assert(paths + n == p);
 
         qsort(paths, n, sizeof(Path), path_compare);
-        drop_duplicates(paths, &n, &need_inaccessible, &need_private);
+        drop_duplicates(paths, &n, &need_inaccessible);
 
-        if (!mkdtemp(tmp_dir)) {
-                r = -errno;
-                goto fail;
-        }
-        remove_tmp = true;
+        if (need_inaccessible) {
+                mode_t u;
+                char *d;
 
-        memcpy(root_dir, tmp_dir, sizeof(tmp_dir)-1);
-        if (mkdir(root_dir, 0777) < 0) {
-                r = -errno;
-                goto fail;
-        }
-        remove_root = true;
+                u = umask(0777);
+                d = mkdtemp(inaccessible_dir);
+                umask(u);
 
-        if (need_inaccessible) {
-                memcpy(inaccessible_dir, tmp_dir, sizeof(tmp_dir)-1);
-                if (mkdir(inaccessible_dir, 0) < 0) {
+                if (!d) {
                         r = -errno;
                         goto fail;
                 }
+
                 remove_inaccessible = true;
         }
 
-        if (need_private) {
+        if (private_tmp) {
                 mode_t u;
-
-                memcpy(private_dir, tmp_dir, sizeof(tmp_dir)-1);
+                char *d;
 
                 u = umask(0000);
-                if (mkdir(private_dir, 0777 + S_ISVTX) < 0) {
-                        umask(u);
+                d = mkdtemp(tmp_dir);
+                umask(u);
 
+                if (!d) {
                         r = -errno;
                         goto fail;
                 }
 
+                remove_tmp = true;
+
+                u = umask(0000);
+                d = mkdtemp(var_tmp_dir);
                 umask(u);
-                remove_private = true;
+
+                if (!d) {
+                        r = -errno;
+                        goto fail;
+                }
+
+                remove_var_tmp = true;
+
+                if (chmod(tmp_dir, 0777 + S_ISVTX) < 0) {
+                        r = -errno;
+                        goto fail;
+                }
+
+                if (chmod(var_tmp_dir, 0777 + S_ISVTX) < 0) {
+                        r = -errno;
+                        goto fail;
+                }
         }
 
         if (unshare(CLONE_NEWNS) < 0) {
@@ -282,7 +284,7 @@ int setup_namespace(
                 goto fail;
         }
 
-        /* Remount / as SLAVE so that nothing mounted in the namespace
+        /* Remount / as SLAVE so that nothing now mounted in the namespace
            shows up in the parent */
         if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
                 r = -errno;
@@ -290,69 +292,39 @@ int setup_namespace(
         }
 
         for (p = paths; p < paths + n; p++) {
-                r = apply_mount(p, root_dir, inaccessible_dir, private_dir, flags);
+                r = apply_mount(p, tmp_dir, var_tmp_dir, inaccessible_dir);
                 if (r < 0)
                         goto undo_mounts;
         }
 
-        memcpy(old_root_dir, tmp_dir, sizeof(tmp_dir)-1);
-        if (!mkdtemp(old_root_dir)) {
-                r = -errno;
-                goto undo_mounts;
-        }
-        remove_old_root = true;
-
-        if (chdir(root_dir) < 0) {
-                r = -errno;
-                goto undo_mounts;
+        for (p = paths; p < paths + n; p++) {
+                r = make_read_only(p);
+                if (r < 0)
+                        goto undo_mounts;
         }
 
-        if (pivot_root(root_dir, old_root_dir) < 0) {
+        /* Remount / as the desired mode */
+        if (mount(NULL, "/", NULL, flags|MS_REC, NULL) < 0) {
                 r = -errno;
                 goto undo_mounts;
         }
 
-        free(paths);
-
-        t = old_root_dir + sizeof(root_dir) - 1;
-        if (umount2(t, MNT_DETACH) < 0)
-                /* At this point it's too late to turn anything back,
-                 * since we are already in the new root. */
-                return -errno;
-
-        if (rmdir(t) < 0)
-                return -errno;
-
         return 0;
 
 undo_mounts:
-
-        for (p--; p >= paths; p--) {
-                char full_path[PATH_MAX];
-
-                snprintf(full_path, sizeof(full_path), "%s%s", root_dir, p->path);
-                char_array_0(full_path);
-
-                umount2(full_path, MNT_DETACH);
-        }
+        for (p = paths; p < paths + n; p++)
+                if (p->done)
+                        umount2(p->path, MNT_DETACH);
 
 fail:
-        if (remove_old_root)
-                rmdir(old_root_dir);
-
         if (remove_inaccessible)
                 rmdir(inaccessible_dir);
 
-        if (remove_private)
-                rmdir(private_dir);
-
-        if (remove_root)
-                rmdir(root_dir);
-
         if (remove_tmp)
                 rmdir(tmp_dir);
 
-        free(paths);
+        if (remove_var_tmp)
+                rmdir(var_tmp_dir);
 
         return r;
 }
diff --git a/src/core/switch-root.c b/src/core/switch-root.c
index efc7d34..150332a 100644
--- a/src/core/switch-root.c
+++ b/src/core/switch-root.c
@@ -115,6 +115,12 @@ int switch_root(const char *new_root) {
                 goto fail;
         }
 
+        if (chdir("/") < 0) {
+                r = -errno;
+                log_error("Failed to change directory: %m");
+                goto fail;
+        }
+
         if (old_root_fd >= 0) {
                 struct stat rb;
 
diff --git a/src/shared/util.c b/src/shared/util.c
index 0d48691..faf428f 100644
--- a/src/shared/util.c
+++ b/src/shared/util.c
@@ -3006,7 +3006,8 @@ unsigned long long random_ull(void) {
         uint64_t ull;
         ssize_t r;
 
-        if ((fd = open("/dev/urandom", O_RDONLY|O_CLOEXEC|O_NOCTTY)) < 0)
+        fd = open("/dev/urandom", O_RDONLY|O_CLOEXEC|O_NOCTTY);
+        if (fd < 0)
                 goto fallback;
 
         r = loop_read(fd, &ull, sizeof(ull), true);
diff --git a/src/test/test-ns.c b/src/test/test-ns.c
index e2bdfc5..a2724b4 100644
--- a/src/test/test-ns.c
+++ b/src/test/test-ns.c
@@ -34,7 +34,7 @@ int main(int argc, char *argv[]) {
                 NULL
         };
 
-        const char * const readable[] = {
+        const char * const readonly[] = {
                 "/",
                 "/usr",
                 "/boot",
@@ -48,7 +48,8 @@ int main(int argc, char *argv[]) {
 
         int r;
 
-        if ((r = setup_namespace((char**) writable, (char**) readable, (char**) inaccessible, true, MS_SHARED)) < 0) {
+        r = setup_namespace((char**) writable, (char**) readonly, (char**) inaccessible, true, 0);
+        if (r < 0) {
                 log_error("Failed to setup namespace: %s", strerror(-r));
                 return 1;
         }