Blob Blame History Raw
diff --git a/src/condor_daemon_core.V6/condor_daemon_core.h b/src/condor_daemon_core.V6/condor_daemon_core.h
index 3562577..d9d1736 100644
--- a/src/condor_daemon_core.V6/condor_daemon_core.h
+++ b/src/condor_daemon_core.V6/condor_daemon_core.h
@@ -192,6 +192,7 @@ struct FamilyInfo {
 	gid_t* group_ptr;
 #endif
 	const char* glexec_proxy;
+	bool want_pid_namespace;
 	const char* cgroup;
 
 	FamilyInfo() {
@@ -201,6 +202,7 @@ struct FamilyInfo {
 		group_ptr = NULL;
 #endif
 		glexec_proxy = NULL;
+		want_pid_namespace = false;
 		cgroup = NULL;
 	}
 };
diff --git a/src/condor_daemon_core.V6/daemon_core.cpp b/src/condor_daemon_core.V6/daemon_core.cpp
index e058fd3..74fe8a0 100644
--- a/src/condor_daemon_core.V6/daemon_core.cpp
+++ b/src/condor_daemon_core.V6/daemon_core.cpp
@@ -34,6 +34,7 @@
 #if HAVE_CLONE
 #include <sched.h>
 #include <sys/syscall.h>
+#include <sys/mount.h>
 #endif
 
 #if HAVE_RESOLV_H && HAVE_DECL_RES_INIT
@@ -112,6 +113,10 @@ CRITICAL_SECTION Big_fat_mutex; // coarse grained mutex for debugging purposes
 #include <sched.h>
 #endif
 
+#if !defined(CLONE_NEWPID)
+#define CLONE_NEWPID 0x20000000
+#endif
+
 static const char* EMPTY_DESCRIP = "<NULL>";
 
 // special errno values that may be returned from Create_Process
@@ -6566,7 +6571,9 @@ public:
 	   m_affinity_mask(affinity_mask),
  	   m_fs_remap(fs_remap),
 	   m_wrote_tracking_gid(false),
-	   m_no_dprintf_allowed(false)
+	   m_no_dprintf_allowed(false),
+	   m_clone_newpid_pid(-1),
+	   m_clone_newpid_ppid(-1)
 	{
 	}
 
@@ -6627,6 +6634,10 @@ private:
 	bool m_wrote_tracking_gid;
 	bool m_no_dprintf_allowed;
 	priv_state m_priv_state;
+	pid_t m_clone_newpid_pid;
+	pid_t m_clone_newpid_ppid;
+
+	pid_t fork(int);
 };
 
 enum {
@@ -6650,7 +6661,19 @@ pid_t CreateProcessForkit::clone_safe_getpid() {
 		// the pid of the parent process (presumably due to internal
 		// caching in libc).  Therefore, use the syscall to get
 		// the answer directly.
-	return syscall(SYS_getpid);
+
+	int retval = syscall(SYS_getpid);
+
+		// If we were fork'd with CLONE_NEWPID, we think our PID is 1.
+		// In this case, ask the parent!
+	if (retval == 1) {
+		if (m_clone_newpid_pid == -1) {
+			EXCEPT("getpid is 1!");
+		}
+		retval = m_clone_newpid_pid;
+	}
+
+	return retval;
 #else
 	return ::getpid();
 #endif
@@ -6659,12 +6682,115 @@ pid_t CreateProcessForkit::clone_safe_getppid() {
 #if HAVE_CLONE
 		// See above comment for clone_safe_getpid() for explanation of
 		// why we need to do this.
-	return syscall(SYS_getppid);
+	
+	int retval = syscall(SYS_getppid);
+
+		// If ppid is 0, then either Condor is init (DEAR GOD) or we
+		// were created with CLONE_NEWPID; ask the parent!
+	if (retval == 0) {
+		if (m_clone_newpid_ppid == -1) {
+			EXCEPT("getppid is 0!");
+		}
+		retval = m_clone_newpid_ppid;
+	}
+
+	return retval;
 #else
 	return ::getppid();
 #endif
 }
 
+/**
+ * fork allows one to use certain clone syscall flags, but provides more
+ * familiar POSIX fork semantics.
+ * NOTES:
+ *   - We whitelist the flags you are allowed to pass.  Currently supported:
+ *     - CLONE_NEWPID.  Implies CLONE_NEWNS.
+ *       If the clone succeeds but the remount fails, the child calls _exit(1),
+ *       but the parent will return successfully.
+ *       It would be a simple fix to have the parent return the failure, if
+ *       someone desired.
+ *     Flags are whitelisted to help us adhere to the fork-like semantics (no
+ *     shared memory between parent and child, for example).  If you give other
+ *     flags, they are silently ignored.
+ *   - man pages indicate that clone on i386 is only fully functional when used
+ *     via ASM, not the vsyscall interface.  This doesn't appear to be relevant
+ *     to this particular use case.
+ *   - To avoid linking with pthreads (or copy/pasting lots of glibc code), I 
+ *     don't include integration with threads.  This means various threading
+ *     calls in the child may not function correctly (pre-exec; post-exec
+ *     should be fine), and pthreads might not notice when the child exits.
+ *     Traditional POSIX calls like wait will still function because the 
+ *     parent will receive the SIGCHLD.
+ *     This is simple to fix if someone desired, but I'd mostly rather not link
+ *     with pthreads.
+ */
+
+#define ALLOWED_FLAGS (SIGCHLD | CLONE_NEWPID | CLONE_NEWNS )
+
+pid_t CreateProcessForkit::fork(int flags) {
+
+    // If you don't need any fancy flags, just do the old boring POSIX call
+    if (flags == 0) {
+        return ::fork();
+    }
+
+#if HAVE_CLONE
+
+    int rw[2]; // Communication pipes for the CLONE_NEWPID case.
+
+    flags |= SIGCHLD; // The only necessary flag.
+    if (flags & CLONE_NEWPID) {
+        flags |= CLONE_NEWNS;
+	if (pipe(rw)) {
+		EXCEPT("UNABLE TO CREATE PIPE.");
+	}
+    }
+
+	// fork as root if we have our fancy flags.
+    priv_state orig_state = set_priv(PRIV_ROOT);
+    int retval = syscall(SYS_clone, ALLOWED_FLAGS & flags, 0, NULL, NULL);
+
+	// Child
+    if ((retval == 0) && (flags & CLONE_NEWPID)) {
+
+            // If we should have forked as non-root, make things in life final.
+        set_priv(orig_state);
+
+        if (full_read(rw[0], &m_clone_newpid_ppid, sizeof(pid_t)) != sizeof(pid_t)) {
+            EXCEPT("Unable to write into pipe.");
+        }
+        if (full_read(rw[0], &m_clone_newpid_pid, sizeof(pid_t)) != sizeof(pid_t)) {
+            EXCEPT("Unable to write into pipe.");
+        }
+
+	// Parent
+    } else if (retval > 0) {
+        set_priv(orig_state);
+	pid_t ppid = getpid(); // We are parent, so don't need clone_safe_pid.
+        if (full_write(rw[1], &ppid, sizeof(ppid)) != sizeof(ppid)) {
+            EXCEPT("Unable to write into pipe.");
+        }
+        if (full_write(rw[1], &retval, sizeof(ppid)) != sizeof(ppid)) {
+            EXCEPT("Unable to write into pipe.");
+        }
+    }
+	// retval=-1 falls through here.
+    if (flags & CLONE_NEWPID) {
+        close(rw[0]);
+        close(rw[1]);
+    }
+    return retval;
+
+#else
+
+    // Note we silently ignore flags if there's no clone on the platform.
+    return ::fork();
+
+#endif
+
+}
+
 pid_t CreateProcessForkit::fork_exec() {
 	pid_t newpid;
 
@@ -6736,7 +6862,11 @@ pid_t CreateProcessForkit::fork_exec() {
 	}
 #endif /* HAVE_CLONE */
 
-	newpid = fork();
+	int fork_flags = 0;
+	if (m_family_info) {
+		fork_flags |= m_family_info->want_pid_namespace ? CLONE_NEWPID : 0;
+	}
+	newpid = this->fork(fork_flags);
 	if( newpid == 0 ) {
 			// in child
 		enterCreateProcessChild(this);
diff --git a/src/condor_starter.V6.1/vanilla_proc.cpp b/src/condor_starter.V6.1/vanilla_proc.cpp
index 044cb10..8528ca7 100644
--- a/src/condor_starter.V6.1/vanilla_proc.cpp
+++ b/src/condor_starter.V6.1/vanilla_proc.cpp
@@ -360,6 +360,24 @@ VanillaProc::StartJob()
 		}
 	}
 
+#if defined(LINUX)
+	// On Linux kernel 2.6.24 and later, we can give each
+	// job its own PID namespace
+	if (param_boolean("USE_PID_NAMESPACES", false)) {
+		if (!can_switch_ids()) {
+			EXCEPT("USE_PID_NAMESPACES enabled, but can't perform this "
+				"call in Linux unless running as root.");
+		}
+		fi.want_pid_namespace = true;
+		if (!fs_remap) {
+			fs_remap = new FilesystemRemap();
+		}
+		fs_remap->RemapProc();
+	}
+	dprintf(D_FULLDEBUG, "PID namespace option: %s\n", fi.want_pid_namespace ? "true" : "false");
+#endif
+
+
 	// have OsProc start the job
 	//
 	int retval = OsProc::StartJob(&fi, fs_remap);
diff --git a/src/condor_utils/filesystem_remap.cpp b/src/condor_utils/filesystem_remap.cpp
index e0f2e61..735c744 100644
--- a/src/condor_utils/filesystem_remap.cpp
+++ b/src/condor_utils/filesystem_remap.cpp
@@ -29,7 +29,8 @@
 
 FilesystemRemap::FilesystemRemap() :
 	m_mappings(),
-	m_mounts_shared()
+	m_mounts_shared(),
+	m_remap_proc(false)
 {
 	ParseMountinfo();
 }
@@ -120,6 +121,9 @@ int FilesystemRemap::PerformMappings() {
 			break;
 		}
 	}
+	if ((!retval) && m_remap_proc) {
+		retval = mount("proc", "/proc", "proc", 0, NULL);
+	}
 #endif
 	return retval;
 }
@@ -148,6 +152,10 @@ std::string FilesystemRemap::RemapDir(std::string target) {
 	return target;
 }
 
+void FilesystemRemap::RemapProc() {
+	m_remap_proc = true;
+}
+
 /*
   Sample mountinfo contents (from http://www.kernel.org/doc/Documentation/filesystems/proc.txt):
   36 35 98:0 /mnt1 /mnt2 rw,noatime master:1 - ext3 /dev/root rw,errors=continue
diff --git a/src/condor_utils/filesystem_remap.h b/src/condor_utils/filesystem_remap.h
index 5e9362d..2e17476 100644
--- a/src/condor_utils/filesystem_remap.h
+++ b/src/condor_utils/filesystem_remap.h
@@ -74,6 +74,12 @@ public:
 	 */
 	std::string RemapFile(std::string);
 
+	/**
+	 * Indicate that we should remount /proc in the child process.
+	 * Necessary for PID namespaces.
+	 */
+	void RemapProc();
+
 private:
 
 	/**
@@ -89,6 +95,7 @@ private:
 	std::list<pair_strings> m_mappings;
 	std::list<pair_str_bool> m_mounts_shared;
	std::list<pair_strings> m_mounts_autofs;
+	bool m_remap_proc;
 
 };
 #endif