|
|
b3fe293 |
diff --git a/src/condor_daemon_core.V6/condor_daemon_core.h b/src/condor_daemon_core.V6/condor_daemon_core.h
|
|
|
b3fe293 |
index 3562577..d9d1736 100644
|
|
|
b3fe293 |
--- a/src/condor_daemon_core.V6/condor_daemon_core.h
|
|
|
b3fe293 |
+++ b/src/condor_daemon_core.V6/condor_daemon_core.h
|
|
|
b3fe293 |
@@ -192,6 +192,7 @@ struct FamilyInfo {
|
|
|
b3fe293 |
gid_t* group_ptr;
|
|
|
b3fe293 |
#endif
|
|
|
b3fe293 |
const char* glexec_proxy;
|
|
|
b3fe293 |
+ bool want_pid_namespace;
|
|
|
b3fe293 |
const char* cgroup;
|
|
|
b3fe293 |
|
|
|
b3fe293 |
FamilyInfo() {
|
|
|
b3fe293 |
@@ -201,6 +202,7 @@ struct FamilyInfo {
|
|
|
b3fe293 |
group_ptr = NULL;
|
|
|
b3fe293 |
#endif
|
|
|
b3fe293 |
glexec_proxy = NULL;
|
|
|
b3fe293 |
+ want_pid_namespace = false;
|
|
|
b3fe293 |
cgroup = NULL;
|
|
|
b3fe293 |
}
|
|
|
b3fe293 |
};
|
|
|
b3fe293 |
diff --git a/src/condor_daemon_core.V6/daemon_core.cpp b/src/condor_daemon_core.V6/daemon_core.cpp
|
|
|
b3fe293 |
index e058fd3..74fe8a0 100644
|
|
|
b3fe293 |
--- a/src/condor_daemon_core.V6/daemon_core.cpp
|
|
|
b3fe293 |
+++ b/src/condor_daemon_core.V6/daemon_core.cpp
|
|
|
b3fe293 |
@@ -34,6 +34,7 @@
|
|
|
b3fe293 |
#if HAVE_CLONE
|
|
|
b3fe293 |
#include <sched.h>
|
|
|
b3fe293 |
#include <sys/syscall.h>
|
|
|
b3fe293 |
+#include <sys/mount.h>
|
|
|
b3fe293 |
#endif
|
|
|
b3fe293 |
|
|
|
b3fe293 |
#if HAVE_RESOLV_H && HAVE_DECL_RES_INIT
|
|
|
b3fe293 |
@@ -112,6 +113,10 @@ CRITICAL_SECTION Big_fat_mutex; // coarse grained mutex for debugging purposes
|
|
|
b3fe293 |
#include <sched.h>
|
|
|
b3fe293 |
#endif
|
|
|
b3fe293 |
|
|
|
b3fe293 |
+#if !defined(CLONE_NEWPID)
|
|
|
b3fe293 |
+#define CLONE_NEWPID 0x20000000
|
|
|
b3fe293 |
+#endif
|
|
|
b3fe293 |
+
|
|
|
b3fe293 |
static const char* EMPTY_DESCRIP = "<NULL>";
|
|
|
b3fe293 |
|
|
|
b3fe293 |
// special errno values that may be returned from Create_Process
|
|
|
b3fe293 |
@@ -6566,7 +6571,9 @@ public:
|
|
|
b3fe293 |
m_affinity_mask(affinity_mask),
|
|
|
b3fe293 |
m_fs_remap(fs_remap),
|
|
|
b3fe293 |
m_wrote_tracking_gid(false),
|
|
|
b3fe293 |
- m_no_dprintf_allowed(false)
|
|
|
b3fe293 |
+ m_no_dprintf_allowed(false),
|
|
|
b3fe293 |
+ m_clone_newpid_pid(-1),
|
|
|
b3fe293 |
+ m_clone_newpid_ppid(-1)
|
|
|
b3fe293 |
{
|
|
|
b3fe293 |
}
|
|
|
b3fe293 |
|
|
|
b3fe293 |
@@ -6627,6 +6634,10 @@ private:
|
|
|
b3fe293 |
bool m_wrote_tracking_gid;
|
|
|
b3fe293 |
bool m_no_dprintf_allowed;
|
|
|
b3fe293 |
priv_state m_priv_state;
|
|
|
b3fe293 |
+ pid_t m_clone_newpid_pid;
|
|
|
b3fe293 |
+ pid_t m_clone_newpid_ppid;
|
|
|
b3fe293 |
+
|
|
|
b3fe293 |
+ pid_t fork(int);
|
|
|
b3fe293 |
};
|
|
|
b3fe293 |
|
|
|
b3fe293 |
enum {
|
|
|
b3fe293 |
@@ -6650,7 +6661,19 @@ pid_t CreateProcessForkit::clone_safe_getpid() {
|
|
|
b3fe293 |
// the pid of the parent process (presumably due to internal
|
|
|
b3fe293 |
// caching in libc). Therefore, use the syscall to get
|
|
|
b3fe293 |
// the answer directly.
|
|
|
b3fe293 |
- return syscall(SYS_getpid);
|
|
|
b3fe293 |
+
|
|
|
b3fe293 |
+ int retval = syscall(SYS_getpid);
|
|
|
b3fe293 |
+
|
|
|
b3fe293 |
+ // If we were fork'd with CLONE_NEWPID, we think our PID is 1.
|
|
|
b3fe293 |
+ // In this case, ask the parent!
|
|
|
b3fe293 |
+ if (retval == 1) {
|
|
|
b3fe293 |
+ if (m_clone_newpid_pid == -1) {
|
|
|
b3fe293 |
+ EXCEPT("getpid is 1!");
|
|
|
b3fe293 |
+ }
|
|
|
b3fe293 |
+ retval = m_clone_newpid_pid;
|
|
|
b3fe293 |
+ }
|
|
|
b3fe293 |
+
|
|
|
b3fe293 |
+ return retval;
|
|
|
b3fe293 |
#else
|
|
|
b3fe293 |
return ::getpid();
|
|
|
b3fe293 |
#endif
|
|
|
b3fe293 |
@@ -6659,12 +6682,115 @@ pid_t CreateProcessForkit::clone_safe_getppid() {
|
|
|
b3fe293 |
#if HAVE_CLONE
|
|
|
b3fe293 |
// See above comment for clone_safe_getpid() for explanation of
|
|
|
b3fe293 |
// why we need to do this.
|
|
|
b3fe293 |
- return syscall(SYS_getppid);
|
|
|
b3fe293 |
+
|
|
|
b3fe293 |
+ int retval = syscall(SYS_getppid);
|
|
|
b3fe293 |
+
|
|
|
b3fe293 |
+ // If ppid is 0, then either Condor is init (DEAR GOD) or we
|
|
|
b3fe293 |
+ // were created with CLONE_NEWPID; ask the parent!
|
|
|
b3fe293 |
+ if (retval == 0) {
|
|
|
b3fe293 |
+ if (m_clone_newpid_ppid == -1) {
|
|
|
b3fe293 |
+ EXCEPT("getppid is 0!");
|
|
|
b3fe293 |
+ }
|
|
|
b3fe293 |
+ retval = m_clone_newpid_ppid;
|
|
|
b3fe293 |
+ }
|
|
|
b3fe293 |
+
|
|
|
b3fe293 |
+ return retval;
|
|
|
b3fe293 |
#else
|
|
|
b3fe293 |
return ::getppid();
|
|
|
b3fe293 |
#endif
|
|
|
b3fe293 |
}
|
|
|
b3fe293 |
|
|
|
b3fe293 |
+/**
|
|
|
b3fe293 |
+ * fork allows one to use certain clone syscall flags, but provides more
|
|
|
b3fe293 |
+ * familiar POSIX fork semantics.
|
|
|
b3fe293 |
+ * NOTES:
|
|
|
b3fe293 |
+ * - We whitelist the flags you are allowed to pass. Currently supported:
|
|
|
b3fe293 |
+ * - CLONE_NEWPID. Implies CLONE_NEWNS.
|
|
|
b3fe293 |
+ * If the clone succeeds but the remount fails, the child calls _exit(1),
|
|
|
b3fe293 |
+ * but the parent will return successfully.
|
|
|
b3fe293 |
+ * It would be a simple fix to have the parent return the failure, if
|
|
|
b3fe293 |
+ * someone desired.
|
|
|
b3fe293 |
+ * Flags are whitelisted to help us adhere to the fork-like semantics (no
|
|
|
b3fe293 |
+ * shared memory between parent and child, for example). If you give other
|
|
|
b3fe293 |
+ * flags, they are silently ignored.
|
|
|
b3fe293 |
+ * - man pages indicate that clone on i386 is only fully functional when used
|
|
|
b3fe293 |
+ * via ASM, not the vsyscall interface. This doesn't appear to be relevant
|
|
|
b3fe293 |
+ * to this particular use case.
|
|
|
b3fe293 |
+ * - To avoid linking with pthreads (or copy/pasting lots of glibc code), I
|
|
|
b3fe293 |
+ * don't include integration with threads. This means various threading
|
|
|
b3fe293 |
+ * calls in the child may not function correctly (pre-exec; post-exec
|
|
|
b3fe293 |
+ * should be fine), and pthreads might not notice when the child exits.
|
|
|
b3fe293 |
+ * Traditional POSIX calls like wait will still function because the
|
|
|
b3fe293 |
+ * parent will receive the SIGCHLD.
|
|
|
b3fe293 |
+ * This is simple to fix if someone desired, but I'd mostly rather not link
|
|
|
b3fe293 |
+ * with pthreads.
|
|
|
b3fe293 |
+ */
|
|
|
b3fe293 |
+
|
|
|
b3fe293 |
+#define ALLOWED_FLAGS (SIGCHLD | CLONE_NEWPID | CLONE_NEWNS )
|
|
|
b3fe293 |
+
|
|
|
b3fe293 |
+pid_t CreateProcessForkit::fork(int flags) {
|
|
|
b3fe293 |
+
|
|
|
b3fe293 |
+ // If you don't need any fancy flags, just do the old boring POSIX call
|
|
|
b3fe293 |
+ if (flags == 0) {
|
|
|
b3fe293 |
+ return ::fork();
|
|
|
b3fe293 |
+ }
|
|
|
b3fe293 |
+
|
|
|
b3fe293 |
+#if HAVE_CLONE
|
|
|
b3fe293 |
+
|
|
|
b3fe293 |
+ int rw[2]; // Pipe (rw[0] = read end, rw[1] = write end) the parent uses to send the child its real PPID and PID in the CLONE_NEWPID case.
|
|
|
b3fe293 |
+
|
|
|
b3fe293 |
+ flags |= SIGCHLD; // The only necessary flag.
|
|
|
b3fe293 |
+ if (flags & CLONE_NEWPID) {
|
|
|
b3fe293 |
+ flags |= CLONE_NEWNS;
|
|
|
b3fe293 |
+ if (pipe(rw)) {
|
|
|
b3fe293 |
+ EXCEPT("UNABLE TO CREATE PIPE.");
|
|
|
b3fe293 |
+ }
|
|
|
b3fe293 |
+ }
|
|
|
b3fe293 |
+
|
|
|
b3fe293 |
+ // fork as root if we have our fancy flags.
|
|
|
b3fe293 |
+ priv_state orig_state = set_priv(PRIV_ROOT);
|
|
|
b3fe293 |
+ int retval = syscall(SYS_clone, ALLOWED_FLAGS & flags, 0, NULL, NULL);
|
|
|
b3fe293 |
+
|
|
|
b3fe293 |
+ // Child
|
|
|
b3fe293 |
+ if ((retval == 0) && (flags & CLONE_NEWPID)) {
|
|
|
b3fe293 |
+
|
|
|
b3fe293 |
+ // We switched to root only for the clone itself; restore the caller's original privilege state in the child.
|
|
|
b3fe293 |
+ set_priv(orig_state);
|
|
|
b3fe293 |
+
|
|
|
b3fe293 |
+ if (full_read(rw[0], &m_clone_newpid_ppid, sizeof(pid_t)) != sizeof(pid_t)) {
|
|
|
b3fe293 |
+ EXCEPT("Unable to write into pipe.");
|
|
|
b3fe293 |
+ }
|
|
|
b3fe293 |
+ if (full_read(rw[0], &m_clone_newpid_pid, sizeof(pid_t)) != sizeof(pid_t)) {
|
|
|
b3fe293 |
+ EXCEPT("Unable to write into pipe.");
|
|
|
b3fe293 |
+ }
|
|
|
b3fe293 |
+
|
|
|
b3fe293 |
+ // Parent
|
|
|
b3fe293 |
+ } else if (retval > 0) {
|
|
|
b3fe293 |
+ set_priv(orig_state);
|
|
|
b3fe293 |
+ pid_t ppid = getpid(); // We are the parent, so we don't need clone_safe_getpid().
|
|
|
b3fe293 |
+ if (full_write(rw[1], &ppid, sizeof(ppid)) != sizeof(ppid)) {
|
|
|
b3fe293 |
+ EXCEPT("Unable to write into pipe.");
|
|
|
b3fe293 |
+ }
|
|
|
b3fe293 |
+ if (full_write(rw[1], &retval, sizeof(ppid)) != sizeof(ppid)) {
|
|
|
b3fe293 |
+ EXCEPT("Unable to write into pipe.");
|
|
|
b3fe293 |
+ }
|
|
|
b3fe293 |
+ }
|
|
|
b3fe293 |
+ // retval=-1 falls through here.
|
|
|
b3fe293 |
+ if (flags & CLONE_NEWPID) {
|
|
|
b3fe293 |
+ close(rw[0]);
|
|
|
b3fe293 |
+ close(rw[1]);
|
|
|
b3fe293 |
+ }
|
|
|
b3fe293 |
+ return retval;
|
|
|
b3fe293 |
+
|
|
|
b3fe293 |
+#else
|
|
|
b3fe293 |
+
|
|
|
b3fe293 |
+ // Note we silently ignore flags if there's no clone on the platform.
|
|
|
b3fe293 |
+ return ::fork();
|
|
|
b3fe293 |
+
|
|
|
b3fe293 |
+#endif
|
|
|
b3fe293 |
+
|
|
|
b3fe293 |
+}
|
|
|
b3fe293 |
+
|
|
|
b3fe293 |
pid_t CreateProcessForkit::fork_exec() {
|
|
|
b3fe293 |
pid_t newpid;
|
|
|
b3fe293 |
|
|
|
b3fe293 |
@@ -6736,7 +6862,11 @@ pid_t CreateProcessForkit::fork_exec() {
|
|
|
b3fe293 |
}
|
|
|
b3fe293 |
#endif /* HAVE_CLONE */
|
|
|
b3fe293 |
|
|
|
b3fe293 |
- newpid = fork();
|
|
|
b3fe293 |
+ int fork_flags = 0;
|
|
|
b3fe293 |
+ if (m_family_info) {
|
|
|
b3fe293 |
+ fork_flags |= m_family_info->want_pid_namespace ? CLONE_NEWPID : 0;
|
|
|
b3fe293 |
+ }
|
|
|
b3fe293 |
+ newpid = this->fork(fork_flags);
|
|
|
b3fe293 |
if( newpid == 0 ) {
|
|
|
b3fe293 |
// in child
|
|
|
b3fe293 |
enterCreateProcessChild(this);
|
|
|
b3fe293 |
diff --git a/src/condor_starter.V6.1/vanilla_proc.cpp b/src/condor_starter.V6.1/vanilla_proc.cpp
|
|
|
b3fe293 |
index 044cb10..8528ca7 100644
|
|
|
b3fe293 |
--- a/src/condor_starter.V6.1/vanilla_proc.cpp
|
|
|
b3fe293 |
+++ b/src/condor_starter.V6.1/vanilla_proc.cpp
|
|
|
b3fe293 |
@@ -360,6 +360,24 @@ VanillaProc::StartJob()
|
|
|
b3fe293 |
}
|
|
|
b3fe293 |
}
|
|
|
b3fe293 |
|
|
|
b3fe293 |
+#if defined(LINUX)
|
|
|
b3fe293 |
+ // On Linux kernel 2.6.24 and later, we can give each
|
|
|
b3fe293 |
+ // job its own PID namespace
|
|
|
b3fe293 |
+ if (param_boolean("USE_PID_NAMESPACES", false)) {
|
|
|
b3fe293 |
+ if (!can_switch_ids()) {
|
|
|
b3fe293 |
+ EXCEPT("USE_PID_NAMESPACES enabled, but can't perform this "
|
|
|
b3fe293 |
+ "call in Linux unless running as root.");
|
|
|
b3fe293 |
+ }
|
|
|
b3fe293 |
+ fi.want_pid_namespace = true;
|
|
|
b3fe293 |
+ if (!fs_remap) {
|
|
|
b3fe293 |
+ fs_remap = new FilesystemRemap();
|
|
|
b3fe293 |
+ }
|
|
|
b3fe293 |
+ fs_remap->RemapProc();
|
|
|
b3fe293 |
+ }
|
|
|
b3fe293 |
+ dprintf(D_FULLDEBUG, "PID namespace option: %s\n", fi.want_pid_namespace ? "true" : "false");
|
|
|
b3fe293 |
+#endif
|
|
|
b3fe293 |
+
|
|
|
b3fe293 |
+
|
|
|
b3fe293 |
// have OsProc start the job
|
|
|
b3fe293 |
//
|
|
|
b3fe293 |
int retval = OsProc::StartJob(&fi, fs_remap);
|
|
|
b3fe293 |
diff --git a/src/condor_utils/filesystem_remap.cpp b/src/condor_utils/filesystem_remap.cpp
|
|
|
b3fe293 |
index e0f2e61..735c744 100644
|
|
|
b3fe293 |
--- a/src/condor_utils/filesystem_remap.cpp
|
|
|
b3fe293 |
+++ b/src/condor_utils/filesystem_remap.cpp
|
|
|
b3fe293 |
@@ -29,7 +29,8 @@
|
|
|
b3fe293 |
|
|
|
b3fe293 |
FilesystemRemap::FilesystemRemap() :
|
|
|
b3fe293 |
m_mappings(),
|
|
|
b3fe293 |
- m_mounts_shared()
|
|
|
b3fe293 |
+ m_mounts_shared(),
|
|
|
b3fe293 |
+ m_remap_proc(false)
|
|
|
b3fe293 |
{
|
|
|
b3fe293 |
ParseMountinfo();
|
|
|
b3fe293 |
}
|
|
|
b3fe293 |
@@ -120,6 +121,9 @@ int FilesystemRemap::PerformMappings() {
|
|
|
b3fe293 |
break;
|
|
|
b3fe293 |
}
|
|
|
b3fe293 |
}
|
|
|
b3fe293 |
+ if ((!retval) && m_remap_proc) {
|
|
|
b3fe293 |
+ retval = mount("proc", "/proc", "proc", 0, NULL);
|
|
|
b3fe293 |
+ }
|
|
|
b3fe293 |
#endif
|
|
|
b3fe293 |
return retval;
|
|
|
b3fe293 |
}
|
|
|
b3fe293 |
@@ -148,6 +152,10 @@ std::string FilesystemRemap::RemapDir(std::string target) {
|
|
|
b3fe293 |
return target;
|
|
|
b3fe293 |
}
|
|
|
b3fe293 |
|
|
|
b3fe293 |
+void FilesystemRemap::RemapProc() {
|
|
|
b3fe293 |
+ m_remap_proc = true;
|
|
|
b3fe293 |
+}
|
|
|
b3fe293 |
+
|
|
|
b3fe293 |
/*
|
|
|
b3fe293 |
Sample mountinfo contents (from http://www.kernel.org/doc/Documentation/filesystems/proc.txt):
|
|
|
b3fe293 |
36 35 98:0 /mnt1 /mnt2 rw,noatime master:1 - ext3 /dev/root rw,errors=continue
|
|
|
b3fe293 |
diff --git a/src/condor_utils/filesystem_remap.h b/src/condor_utils/filesystem_remap.h
|
|
|
b3fe293 |
index 5e9362d..2e17476 100644
|
|
|
b3fe293 |
--- a/src/condor_utils/filesystem_remap.h
|
|
|
b3fe293 |
+++ b/src/condor_utils/filesystem_remap.h
|
|
|
b3fe293 |
@@ -74,6 +74,12 @@ public:
|
|
|
b3fe293 |
*/
|
|
|
b3fe293 |
std::string RemapFile(std::string);
|
|
|
b3fe293 |
|
|
|
b3fe293 |
+ /**
|
|
|
b3fe293 |
+ * Indicate that we should remount /proc in the child process.
|
|
|
b3fe293 |
+ * Necessary for PID namespaces.
|
|
|
b3fe293 |
+ */
|
|
|
b3fe293 |
+ void RemapProc();
|
|
|
b3fe293 |
+
|
|
|
b3fe293 |
private:
|
|
|
b3fe293 |
|
|
|
b3fe293 |
/**
|
|
|
b3fe293 |
@@ -89,6 +95,7 @@ private:
|
|
|
b3fe293 |
std::list<pair_strings> m_mappings;
|
|
|
b3fe293 |
std::list<pair_str_bool> m_mounts_shared;
|
|
|
b3fe293 |
std::list<pair_strings> m_mounts_autofs;
|
|
|
b3fe293 |
+ bool m_remap_proc;
|
|
|
b3fe293 |
|
|
|
b3fe293 |
};
|
|
|
b3fe293 |
#endif
|