b3fe293
diff --git a/src/condor_daemon_core.V6/condor_daemon_core.h b/src/condor_daemon_core.V6/condor_daemon_core.h
b3fe293
index 3562577..d9d1736 100644
b3fe293
--- a/src/condor_daemon_core.V6/condor_daemon_core.h
b3fe293
+++ b/src/condor_daemon_core.V6/condor_daemon_core.h
b3fe293
@@ -192,6 +192,7 @@ struct FamilyInfo {
b3fe293
 	gid_t* group_ptr;
b3fe293
 #endif
b3fe293
 	const char* glexec_proxy;
b3fe293
+	bool want_pid_namespace;
b3fe293
 	const char* cgroup;
b3fe293
 
b3fe293
 	FamilyInfo() {
b3fe293
@@ -201,6 +202,7 @@ struct FamilyInfo {
b3fe293
 		group_ptr = NULL;
b3fe293
 #endif
b3fe293
 		glexec_proxy = NULL;
b3fe293
+		want_pid_namespace = false;
b3fe293
 		cgroup = NULL;
b3fe293
 	}
b3fe293
 };
b3fe293
diff --git a/src/condor_daemon_core.V6/daemon_core.cpp b/src/condor_daemon_core.V6/daemon_core.cpp
b3fe293
index e058fd3..74fe8a0 100644
b3fe293
--- a/src/condor_daemon_core.V6/daemon_core.cpp
b3fe293
+++ b/src/condor_daemon_core.V6/daemon_core.cpp
b3fe293
@@ -34,6 +34,7 @@
b3fe293
 #if HAVE_CLONE
b3fe293
 #include <sched.h>
b3fe293
 #include <sys/syscall.h>
b3fe293
+#include <sys/mount.h>
b3fe293
 #endif
b3fe293
 
b3fe293
 #if HAVE_RESOLV_H && HAVE_DECL_RES_INIT
b3fe293
@@ -112,6 +113,10 @@ CRITICAL_SECTION Big_fat_mutex; // coarse grained mutex for debugging purposes
b3fe293
 #include <sched.h>
b3fe293
 #endif
b3fe293
 
b3fe293
+// Fallback used when the system headers do not define CLONE_NEWPID; the
b3fe293
+// value is the one Linux's <linux/sched.h> assigns to this flag.
b3fe293
+#if !defined(CLONE_NEWPID)
b3fe293
+#define CLONE_NEWPID 0x20000000
b3fe293
+#endif
b3fe293
+
b3fe293
 static const char* EMPTY_DESCRIP = "<NULL>";
b3fe293
 
b3fe293
 // special errno values that may be returned from Create_Process
b3fe293
@@ -6566,7 +6571,9 @@ public:
b3fe293
 	   m_affinity_mask(affinity_mask),
b3fe293
  	   m_fs_remap(fs_remap),
b3fe293
 	   m_wrote_tracking_gid(false),
b3fe293
-	   m_no_dprintf_allowed(false)
b3fe293
+	   m_no_dprintf_allowed(false),
b3fe293
+	   m_clone_newpid_pid(-1),
b3fe293
+	   m_clone_newpid_ppid(-1)
b3fe293
 	{
b3fe293
 	}
b3fe293
 
b3fe293
@@ -6627,6 +6634,10 @@ private:
b3fe293
 	bool m_wrote_tracking_gid;
b3fe293
 	bool m_no_dprintf_allowed;
b3fe293
 	priv_state m_priv_state;
b3fe293
+	pid_t m_clone_newpid_pid;
b3fe293
+	pid_t m_clone_newpid_ppid;
b3fe293
+
b3fe293
+	pid_t fork(int);
b3fe293
 };
b3fe293
 
b3fe293
 enum {
b3fe293
@@ -6650,7 +6661,19 @@ pid_t CreateProcessForkit::clone_safe_getpid() {
b3fe293
 		// the pid of the parent process (presumably due to internal
b3fe293
 		// caching in libc).  Therefore, use the syscall to get
b3fe293
 		// the answer directly.
b3fe293
-	return syscall(SYS_getpid);
b3fe293
+
b3fe293
+	int retval = syscall(SYS_getpid);
b3fe293
+
b3fe293
+		// If we were fork'd with CLONE_NEWPID, we think our PID is 1.
b3fe293
+		// In this case, ask the parent!
b3fe293
+	if (retval == 1) {
b3fe293
+		if (m_clone_newpid_pid == -1) {
b3fe293
+			EXCEPT("getpid is 1!");
b3fe293
+		}
b3fe293
+		retval = m_clone_newpid_pid;
b3fe293
+	}
b3fe293
+
b3fe293
+	return retval;
b3fe293
 #else
b3fe293
 	return ::getpid();
b3fe293
 #endif
b3fe293
@@ -6659,12 +6682,115 @@ pid_t CreateProcessForkit::clone_safe_getppid() {
b3fe293
 #if HAVE_CLONE
b3fe293
 		// See above comment for clone_safe_getpid() for explanation of
b3fe293
 		// why we need to do this.
b3fe293
-	return syscall(SYS_getppid);
b3fe293
+	
b3fe293
+	int retval = syscall(SYS_getppid);
b3fe293
+
b3fe293
+		// If ppid is 0, then either Condor is init (DEAR GOD) or we
b3fe293
+		// were created with CLONE_NEWPID; ask the parent!
b3fe293
+	if (retval == 0) {
b3fe293
+		if (m_clone_newpid_ppid == -1) {
b3fe293
+			EXCEPT("getppid is 0!");
b3fe293
+		}
b3fe293
+		retval = m_clone_newpid_ppid;
b3fe293
+	}
b3fe293
+
b3fe293
+	return retval;
b3fe293
 #else
b3fe293
 	return ::getppid();
b3fe293
 #endif
b3fe293
 }
b3fe293
 
b3fe293
+/**
b3fe293
+ * fork allows one to use certain clone syscall flags, but provides more
b3fe293
+ * familiar POSIX fork semantics.
b3fe293
+ * NOTES:
b3fe293
+ *   - We whitelist the flags you are allowed to pass.  Currently supported:
b3fe293
+ *     - CLONE_NEWPID.  Implies CLONE_NEWNS.
b3fe293
+ *       If the clone succeeds but the remount fails, the child calls _exit(1),
b3fe293
+ *       but the parent will return successfully.
b3fe293
+ *       It would be a simple fix to have the parent return the failure, if
b3fe293
+ *       someone desired.
b3fe293
+ *     Flags are whitelisted to help us adhere to the fork-like semantics (no
b3fe293
+ *     shared memory between parent and child, for example).  If you give other
b3fe293
+ *     flags, they are silently ignored.
b3fe293
+ *   - man pages indicate that clone on i386 is only fully functional when used
b3fe293
+ *     via ASM, not the vsyscall interface.  This doesn't appear to be relevant
b3fe293
+ *     to this particular use case.
b3fe293
+ *   - To avoid linking with pthreads (or copy/pasting lots of glibc code), I 
b3fe293
+ *     don't include integration with threads.  This means various threading
b3fe293
+ *     calls in the child may not function correctly (pre-exec; post-exec
b3fe293
+ *     should be fine), and pthreads might not notice when the child exits.
b3fe293
+ *     Traditional POSIX calls like wait will still function because the 
b3fe293
+ *     parent will receive the SIGCHLD.
b3fe293
+ *     This is simple to fix if someone desired, but I'd mostly rather not link
b3fe293
+ *     with pthreads.
b3fe293
+ */
b3fe293
+
b3fe293
+#define ALLOWED_FLAGS (SIGCHLD | CLONE_NEWPID | CLONE_NEWNS )
b3fe293
+
b3fe293
+// Fork-like wrapper around the clone syscall.  `flags` is a mask of clone
b3fe293
+// flags; only those in ALLOWED_FLAGS are passed to the kernel.  Returns 0 in
b3fe293
+// the child, the child's pid in the parent, and -1 (errno set by clone) on
b3fe293
+// failure.  The caller's privilege state is restored on every path.
b3fe293
+pid_t CreateProcessForkit::fork(int flags) {
b3fe293
+
b3fe293
+    // With no special flags requested, plain POSIX fork() is all we need.
b3fe293
+    if (flags == 0) {
b3fe293
+        return ::fork();
b3fe293
+    }
b3fe293
+
b3fe293
+#if HAVE_CLONE
b3fe293
+
b3fe293
+    // Pipe the parent uses to tell a CLONE_NEWPID child its real pid/ppid.
b3fe293
+    // Only opened when use_pid_ns is true; -1 marks an end that is not open.
b3fe293
+    int rw[2] = { -1, -1 };
b3fe293
+
b3fe293
+    flags |= SIGCHLD; // The only necessary flag.
b3fe293
+    const bool use_pid_ns = (flags & CLONE_NEWPID) != 0;
b3fe293
+    if (use_pid_ns) {
b3fe293
+        flags |= CLONE_NEWNS;
b3fe293
+        if (pipe(rw)) {
b3fe293
+            EXCEPT("UNABLE TO CREATE PIPE.");
b3fe293
+        }
b3fe293
+    }
b3fe293
+
b3fe293
+    // fork as root if we have our fancy flags.
b3fe293
+    priv_state orig_state = set_priv(PRIV_ROOT);
b3fe293
+    int retval = syscall(SYS_clone, ALLOWED_FLAGS & flags, 0, NULL, NULL);
b3fe293
+    int clone_errno = errno; // Calls below may clobber clone's error code.
b3fe293
+
b3fe293
+    // Parent, child and the failure path all return to the caller's
b3fe293
+    // privilege state; nobody is left running as root by accident.
b3fe293
+    set_priv(orig_state);
b3fe293
+
b3fe293
+    if (use_pid_ns) {
b3fe293
+        if (retval == 0) {
b3fe293
+            // Child: drop our copy of the write end first, so the reads
b3fe293
+            // below see EOF instead of blocking forever if the parent goes
b3fe293
+            // away before sending both values.
b3fe293
+            close(rw[1]);
b3fe293
+            rw[1] = -1;
b3fe293
+
b3fe293
+            // Inside the new PID namespace the kernel reports our pid as 1
b3fe293
+            // and our ppid as 0, so learn the real values from the parent.
b3fe293
+            if (full_read(rw[0], &m_clone_newpid_ppid, sizeof(pid_t)) != sizeof(pid_t)) {
b3fe293
+                EXCEPT("Unable to read from pipe.");
b3fe293
+            }
b3fe293
+            if (full_read(rw[0], &m_clone_newpid_pid, sizeof(pid_t)) != sizeof(pid_t)) {
b3fe293
+                EXCEPT("Unable to read from pipe.");
b3fe293
+            }
b3fe293
+        } else if (retval > 0) {
b3fe293
+            // Parent: send our pid first, then the child's pid -- the child
b3fe293
+            // reads them in that order.
b3fe293
+            pid_t ppid = getpid(); // We are parent, so don't need clone_safe_pid.
b3fe293
+            pid_t child_pid = retval;
b3fe293
+            if (full_write(rw[1], &ppid, sizeof(ppid)) != sizeof(ppid)) {
b3fe293
+                EXCEPT("Unable to write into pipe.");
b3fe293
+            }
b3fe293
+            if (full_write(rw[1], &child_pid, sizeof(child_pid)) != sizeof(child_pid)) {
b3fe293
+                EXCEPT("Unable to write into pipe.");
b3fe293
+            }
b3fe293
+        }
b3fe293
+
b3fe293
+        // Every path (including retval == -1) releases whatever pipe ends
b3fe293
+        // are still open.
b3fe293
+        close(rw[0]);
b3fe293
+        if (rw[1] != -1) {
b3fe293
+            close(rw[1]);
b3fe293
+        }
b3fe293
+    }
b3fe293
+
b3fe293
+    if (retval == -1) {
b3fe293
+        // Hand the caller clone's own error, not one from the cleanup above.
b3fe293
+        errno = clone_errno;
b3fe293
+    }
b3fe293
+    return retval;
b3fe293
+
b3fe293
+#else
b3fe293
+
b3fe293
+    // Note we silently ignore flags if there's no clone on the platform.
b3fe293
+    return ::fork();
b3fe293
+
b3fe293
+#endif
b3fe293
+
b3fe293
+}
b3fe293
+
b3fe293
 pid_t CreateProcessForkit::fork_exec() {
b3fe293
 	pid_t newpid;
b3fe293
 
b3fe293
@@ -6736,7 +6862,11 @@ pid_t CreateProcessForkit::fork_exec() {
b3fe293
 	}
b3fe293
 #endif /* HAVE_CLONE */
b3fe293
 
b3fe293
-	newpid = fork();
b3fe293
+	int fork_flags = 0;
b3fe293
+	if (m_family_info) {
b3fe293
+		fork_flags |= m_family_info->want_pid_namespace ? CLONE_NEWPID : 0;
b3fe293
+	}
b3fe293
+	newpid = this->fork(fork_flags);
b3fe293
 	if( newpid == 0 ) {
b3fe293
 			// in child
b3fe293
 		enterCreateProcessChild(this);
b3fe293
diff --git a/src/condor_starter.V6.1/vanilla_proc.cpp b/src/condor_starter.V6.1/vanilla_proc.cpp
b3fe293
index 044cb10..8528ca7 100644
b3fe293
--- a/src/condor_starter.V6.1/vanilla_proc.cpp
b3fe293
+++ b/src/condor_starter.V6.1/vanilla_proc.cpp
b3fe293
@@ -360,6 +360,24 @@ VanillaProc::StartJob()
b3fe293
 		}
b3fe293
 	}
b3fe293
 
b3fe293
+#if defined(LINUX)
b3fe293
+	// On Linux kernel 2.6.24 and later, we can give each
b3fe293
+	// job its own PID namespace
b3fe293
+	if (param_boolean("USE_PID_NAMESPACES", false)) {
b3fe293
+		if (!can_switch_ids()) {
b3fe293
+			EXCEPT("USE_PID_NAMESPACES enabled, but can't perform this "
b3fe293
+				"call in Linux unless running as root.");
b3fe293
+		}
b3fe293
+		fi.want_pid_namespace = true;
b3fe293
+		if (!fs_remap) {
b3fe293
+			fs_remap = new FilesystemRemap();
b3fe293
+		}
b3fe293
+		fs_remap->RemapProc();
b3fe293
+	}
b3fe293
+	dprintf(D_FULLDEBUG, "PID namespace option: %s\n", fi.want_pid_namespace ? "true" : "false");
b3fe293
+#endif
b3fe293
+
b3fe293
+
b3fe293
 	// have OsProc start the job
b3fe293
 	//
b3fe293
 	int retval = OsProc::StartJob(&fi, fs_remap);
b3fe293
diff --git a/src/condor_utils/filesystem_remap.cpp b/src/condor_utils/filesystem_remap.cpp
b3fe293
index e0f2e61..735c744 100644
b3fe293
--- a/src/condor_utils/filesystem_remap.cpp
b3fe293
+++ b/src/condor_utils/filesystem_remap.cpp
b3fe293
@@ -29,7 +29,8 @@
b3fe293
 
b3fe293
 FilesystemRemap::FilesystemRemap() :
b3fe293
 	m_mappings(),
b3fe293
-	m_mounts_shared()
b3fe293
+	m_mounts_shared(),
b3fe293
+	m_remap_proc(false) // /proc is left alone until RemapProc() sets this
b3fe293
 {
b3fe293
 	ParseMountinfo();
b3fe293
 }
b3fe293
@@ -120,6 +121,9 @@ int FilesystemRemap::PerformMappings() {
b3fe293
 			break;
b3fe293
 		}
b3fe293
 	}
b3fe293
+	if ((!retval) && m_remap_proc) {
b3fe293
+		retval = mount("proc", "/proc", "proc", 0, NULL);
b3fe293
+	}
b3fe293
 #endif
b3fe293
 	return retval;
b3fe293
 }
b3fe293
@@ -148,6 +152,10 @@ std::string FilesystemRemap::RemapDir(std::string target) {
b3fe293
 	return target;
b3fe293
 }
b3fe293
 
b3fe293
+// Flag /proc for remounting.  PerformMappings() checks this flag and, when
b3fe293
+// it is set, mounts a "proc" filesystem on /proc.
b3fe293
+void FilesystemRemap::RemapProc()
b3fe293
+{
b3fe293
+	this->m_remap_proc = true;
b3fe293
+}
b3fe293
+
b3fe293
 /*
b3fe293
   Sample mountinfo contents (from http://www.kernel.org/doc/Documentation/filesystems/proc.txt):
b3fe293
   36 35 98:0 /mnt1 /mnt2 rw,noatime master:1 - ext3 /dev/root rw,errors=continue
b3fe293
diff --git a/src/condor_utils/filesystem_remap.h b/src/condor_utils/filesystem_remap.h
b3fe293
index 5e9362d..2e17476 100644
b3fe293
--- a/src/condor_utils/filesystem_remap.h
b3fe293
+++ b/src/condor_utils/filesystem_remap.h
b3fe293
@@ -74,6 +74,12 @@ public:
b3fe293
 	 */
b3fe293
 	std::string RemapFile(std::string);
b3fe293
 
b3fe293
+	/**
b3fe293
+	 * Indicate that we should remount /proc in the child process.
b3fe293
+	 * Necessary for PID namespaces.
b3fe293
+	 */
b3fe293
+	void RemapProc();
b3fe293
+
b3fe293
 private:
b3fe293
 
b3fe293
 	/**
b3fe293
@@ -89,6 +95,7 @@ private:
b3fe293
 	std::list<pair_strings> m_mappings;
b3fe293
 	std::list<pair_str_bool> m_mounts_shared;
b3fe293
	std::list<pair_strings> m_mounts_autofs;
b3fe293
+	bool m_remap_proc;
b3fe293
 
b3fe293
 };
b3fe293
 #endif