diff --git a/src/condor_daemon_core.V6/condor_daemon_core.h b/src/condor_daemon_core.V6/condor_daemon_core.h
index 3562577..d9d1736 100644
--- a/src/condor_daemon_core.V6/condor_daemon_core.h
+++ b/src/condor_daemon_core.V6/condor_daemon_core.h
@@ -192,6 +192,7 @@ struct FamilyInfo {
gid_t* group_ptr;
#endif
const char* glexec_proxy;
+ bool want_pid_namespace;
const char* cgroup;
FamilyInfo() {
@@ -201,6 +202,7 @@ struct FamilyInfo {
group_ptr = NULL;
#endif
glexec_proxy = NULL;
+ want_pid_namespace = false;
cgroup = NULL;
}
};
diff --git a/src/condor_daemon_core.V6/daemon_core.cpp b/src/condor_daemon_core.V6/daemon_core.cpp
index e058fd3..74fe8a0 100644
--- a/src/condor_daemon_core.V6/daemon_core.cpp
+++ b/src/condor_daemon_core.V6/daemon_core.cpp
@@ -34,6 +34,7 @@
#if HAVE_CLONE
#include <sched.h>
#include <sys/syscall.h>
+#include <sys/mount.h>
#endif
#if HAVE_RESOLV_H && HAVE_DECL_RES_INIT
@@ -112,6 +113,10 @@ CRITICAL_SECTION Big_fat_mutex; // coarse grained mutex for debugging purposes
#include <sched.h>
#endif
+#if !defined(CLONE_NEWPID)
+#define CLONE_NEWPID 0x20000000
+#endif
+
static const char* EMPTY_DESCRIP = "<NULL>";
// special errno values that may be returned from Create_Process
@@ -6566,7 +6571,9 @@ public:
m_affinity_mask(affinity_mask),
m_fs_remap(fs_remap),
m_wrote_tracking_gid(false),
- m_no_dprintf_allowed(false)
+ m_no_dprintf_allowed(false),
+ m_clone_newpid_pid(-1),
+ m_clone_newpid_ppid(-1)
{
}
@@ -6627,6 +6634,10 @@ private:
bool m_wrote_tracking_gid;
bool m_no_dprintf_allowed;
priv_state m_priv_state;
+ pid_t m_clone_newpid_pid;
+ pid_t m_clone_newpid_ppid;
+
+ pid_t fork(int);
};
enum {
@@ -6650,7 +6661,19 @@ pid_t CreateProcessForkit::clone_safe_getpid() {
// the pid of the parent process (presumably due to internal
// caching in libc). Therefore, use the syscall to get
// the answer directly.
- return syscall(SYS_getpid);
+
+ int retval = syscall(SYS_getpid);
+
+ // If we were fork'd with CLONE_NEWPID, we think our PID is 1.
+ // In this case, ask the parent!
+ if (retval == 1) {
+ if (m_clone_newpid_pid == -1) {
+ EXCEPT("getpid is 1!");
+ }
+ retval = m_clone_newpid_pid;
+ }
+
+ return retval;
#else
return ::getpid();
#endif
@@ -6659,12 +6682,115 @@ pid_t CreateProcessForkit::clone_safe_getppid() {
#if HAVE_CLONE
// See above comment for clone_safe_getpid() for explanation of
// why we need to do this.
- return syscall(SYS_getppid);
+
+ int retval = syscall(SYS_getppid);
+
+ // If ppid is 0, then either Condor is init (DEAR GOD) or we
+ // were created with CLONE_NEWPID; ask the parent!
+ if (retval == 0) {
+ if (m_clone_newpid_ppid == -1) {
+ EXCEPT("getppid is 0!");
+ }
+ retval = m_clone_newpid_ppid;
+ }
+
+ return retval;
#else
return ::getppid();
#endif
}
+/**
+ * fork allows one to use certain clone syscall flags, but provides more
+ * familiar POSIX fork semantics: 0 in the child, the child's pid in the
+ * parent, -1 on failure (errno from the failed clone is preserved).
+ * NOTES:
+ *  - We whitelist the flags you are allowed to pass (ALLOWED_FLAGS), to help
+ *    us adhere to fork-like semantics (no shared memory between parent and
+ *    child, for example). Other flags are silently ignored. Supported:
+ *    - CLONE_NEWPID. Implies CLONE_NEWNS. The parent sends its own pid and
+ *      the child's pid through a pipe; the child stores them in
+ *      m_clone_newpid_ppid and m_clone_newpid_pid.
+ *      If the clone succeeds but the remount fails, the child calls
+ *      _exit(1), but the parent will return successfully. It would be a
+ *      simple fix to have the parent return the failure, if someone desired.
+ *  - The clone runs as root; the caller's priv state is restored in the
+ *    parent, in the child, and when the clone fails.
+ *  - man pages indicate that clone on i386 is only fully functional when
+ *    used via ASM, not the vsyscall interface. This doesn't appear to be
+ *    relevant to this particular use case.
+ *  - To avoid linking with pthreads (or copy/pasting lots of glibc code), I
+ *    don't include integration with threads. Various threading calls in the
+ *    child may not function correctly (pre-exec; post-exec should be fine),
+ *    and pthreads might not notice when the child exits. POSIX calls like
+ *    wait still function because the parent will receive the SIGCHLD.
+ */
+
+#define ALLOWED_FLAGS (SIGCHLD | CLONE_NEWPID | CLONE_NEWNS )
+
+pid_t CreateProcessForkit::fork(int flags) {
+	// If you don't need any fancy flags, just do the old boring POSIX call
+	if (flags == 0) {
+		return ::fork();
+	}
+
+#if HAVE_CLONE
+	int rw[2]; // Pipe that carries the real pids to a CLONE_NEWPID child.
+	// SIGCHLD is the only necessary flag; anything off the whitelist is
+	// dropped here so the tests below match what clone actually receives.
+	flags = (flags | SIGCHLD) & ALLOWED_FLAGS;
+	const bool new_pid_ns = (flags & CLONE_NEWPID) != 0;
+	if (new_pid_ns) {
+		flags |= CLONE_NEWNS;
+		if (pipe(rw)) {
+			EXCEPT("UNABLE TO CREATE PIPE.");
+		}
+	}
+
+	// fork as root if we have our fancy flags.
+	priv_state orig_state = set_priv(PRIV_ROOT);
+	int retval = syscall(SYS_clone, flags, 0, NULL, NULL);
+	int clone_errno = errno; // set_priv/close below may clobber it
+
+	// Only the clone itself needs root: parent, child and the failure
+	// path all go back to the caller's priv state.
+	set_priv(orig_state);
+
+	if (new_pid_ns && retval == 0) {
+		// Child: drop our copy of the write end so a parent that dies
+		// before writing gives us EOF instead of a hang.
+		close(rw[1]);
+		if (full_read(rw[0], &m_clone_newpid_ppid, sizeof(pid_t)) != sizeof(pid_t)) {
+			EXCEPT("Unable to read from pipe.");
+		}
+		if (full_read(rw[0], &m_clone_newpid_pid, sizeof(pid_t)) != sizeof(pid_t)) {
+			EXCEPT("Unable to read from pipe.");
+		}
+		close(rw[0]);
+	} else if (new_pid_ns) {
+		// Parent, or the clone failed (retval == -1: nothing to send).
+		if (retval > 0) {
+			pid_t ppid = getpid(); // We are parent, so don't need clone_safe_pid.
+			pid_t child_pid = retval;
+			if (full_write(rw[1], &ppid, sizeof(ppid)) != sizeof(ppid)) {
+				EXCEPT("Unable to write into pipe.");
+			}
+			if (full_write(rw[1], &child_pid, sizeof(child_pid)) != sizeof(child_pid)) {
+				EXCEPT("Unable to write into pipe.");
+			}
+		}
+		close(rw[0]);
+		close(rw[1]);
+	}
+	errno = clone_errno;
+	return retval;
+
+#else
+	// Note we silently ignore flags if there's no clone on the platform.
+	return ::fork();
+#endif
+}
+
pid_t CreateProcessForkit::fork_exec() {
pid_t newpid;
@@ -6736,7 +6862,11 @@ pid_t CreateProcessForkit::fork_exec() {
}
#endif /* HAVE_CLONE */
- newpid = fork();
+ int fork_flags = 0;
+ if (m_family_info) {
+ fork_flags |= m_family_info->want_pid_namespace ? CLONE_NEWPID : 0;
+ }
+ newpid = this->fork(fork_flags);
if( newpid == 0 ) {
// in child
enterCreateProcessChild(this);
diff --git a/src/condor_starter.V6.1/vanilla_proc.cpp b/src/condor_starter.V6.1/vanilla_proc.cpp
index 044cb10..8528ca7 100644
--- a/src/condor_starter.V6.1/vanilla_proc.cpp
+++ b/src/condor_starter.V6.1/vanilla_proc.cpp
@@ -360,6 +360,24 @@ VanillaProc::StartJob()
}
}
+#if defined(LINUX)
+ // On Linux kernel 2.6.24 and later, we can give each
+ // job its own PID namespace
+ if (param_boolean("USE_PID_NAMESPACES", false)) {
+ if (!can_switch_ids()) {
+ EXCEPT("USE_PID_NAMESPACES enabled, but can't perform this "
+ "call in Linux unless running as root.");
+ }
+ fi.want_pid_namespace = true;
+ if (!fs_remap) {
+ fs_remap = new FilesystemRemap();
+ }
+ fs_remap->RemapProc();
+ }
+ dprintf(D_FULLDEBUG, "PID namespace option: %s\n", fi.want_pid_namespace ? "true" : "false");
+#endif
+
+
// have OsProc start the job
//
int retval = OsProc::StartJob(&fi, fs_remap);
diff --git a/src/condor_utils/filesystem_remap.cpp b/src/condor_utils/filesystem_remap.cpp
index e0f2e61..735c744 100644
--- a/src/condor_utils/filesystem_remap.cpp
+++ b/src/condor_utils/filesystem_remap.cpp
@@ -29,7 +29,8 @@
FilesystemRemap::FilesystemRemap() :
m_mappings(),
- m_mounts_shared()
+ m_mounts_shared(),
+ m_remap_proc(false)
{
ParseMountinfo();
}
@@ -120,6 +121,9 @@ int FilesystemRemap::PerformMappings() {
break;
}
}
+ if ((!retval) && m_remap_proc) {
+ retval = mount("proc", "/proc", "proc", 0, NULL);
+ }
#endif
return retval;
}
@@ -148,6 +152,10 @@ std::string FilesystemRemap::RemapDir(std::string target) {
return target;
}
+void FilesystemRemap::RemapProc() {
+	this->m_remap_proc = true; // PerformMappings() mounts proc on /proc when set
+}
+
/*
Sample mountinfo contents (from http://www.kernel.org/doc/Documentation/filesystems/proc.txt):
36 35 98:0 /mnt1 /mnt2 rw,noatime master:1 - ext3 /dev/root rw,errors=continue
diff --git a/src/condor_utils/filesystem_remap.h b/src/condor_utils/filesystem_remap.h
index 5e9362d..2e17476 100644
--- a/src/condor_utils/filesystem_remap.h
+++ b/src/condor_utils/filesystem_remap.h
@@ -74,6 +74,12 @@ public:
*/
std::string RemapFile(std::string);
+ /**
+ * Indicate that we should remount /proc in the child process.
+ * Necessary for PID namespaces.
+ */
+ void RemapProc();
+
private:
/**
@@ -89,6 +95,7 @@ private:
std::list<pair_strings> m_mappings;
std::list<pair_str_bool> m_mounts_shared;
std::list<pair_strings> m_mounts_autofs;
+ bool m_remap_proc;
};
#endif