From cb8b6a8b8ea983844584d8ada4d9aa4c88c997fb Mon Sep 17 00:00:00 2001
From: Alastair Houghton <alastair@alastairs-place.net>
Date: Tue, 29 Dec 2020 14:02:39 +0000
Subject: [PATCH] cifs.upcall: try to use container ipc/uts/net/pid/mnt/user
 namespaces

In certain scenarios (e.g. kerberos multimount), when a process does
syscalls, the kernel sometimes has to query information or trigger
some actions in userspace. To do so it calls the cifs.upcall binary
with information on the process that triggered the syscall in the
first place.

ls(pid=10) ====> open("foo") ====> kernel

                                   that user doesn't have an SMB
                                   session, let's create one using their
                                   kerberos credential cache

                                   call cifs.upcall and ask for krb info
                                   for whoever owns pid=10
                                                         |
                  cifs.upcall --pid 10 <=================+

               ...gather info...
                  return binary blob used
                  when establishing SMB session
                        ===================> kernel
                                              open SMB session, handle
                                              open() syscall
ls <===================================   return open() result to ls

On a system using containers, the kernel is still calling the host
cifs.upcall and using the host configuration (for network, pid, etc).

This patch changes the behaviour of cifs.upcall so that it uses the
calling process's namespaces (those of ls in the example) when doing
its job.

Note that the kernel still calls the binary in the host, but the
binary will place itself in the context of the calling process's
namespaces.

This code makes use of (but shouldn't require) the following kernel
config options and syscall flags:

approx. year   |
introduced     |  config/flags
---------------+----------------
2008           | CONFIG_NAMESPACES=y
2007           | CONFIG_UTS_NS=y
2020           | CONFIG_TIME_NS=y
2006           | CONFIG_IPC_NS=y
2007           | CONFIG_USER_NS
2008           | CONFIG_PID_NS=y
2007           | CONFIG_NET_NS=y
2007           | CONFIG_CGROUPS
2016           | CLONE_NEWCGROUP setns() flag

Signed-off-by: Aurelien Aptel <aaptel@suse.com>
Signed-off-by: Alastair Houghton <alastair@alastairs-place.net>
---
 cifs.upcall.c | 171 ++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 171 insertions(+)
diff --git a/cifs.upcall.c b/cifs.upcall.c
index 1559434..141dc66 100644
--- a/cifs.upcall.c
+++ b/cifs.upcall.c
@@ -51,6 +51,7 @@
 #include <grp.h>
 #include <stdbool.h>
 #include <errno.h>
+#include <sched.h>
 
 #include "data_blob.h"
 #include "spnego.h"
@@ -230,6 +231,164 @@ err_cache:
 	return credtime;
 }
 
+static struct namespace_file {
+	int nstype;		/* CLONE_NEW* flag handed to setns() */
+	const char *name;	/* entry name under /proc/<pid>/ns/ */
+	int fd;			/* open descriptor for that entry, or -1 */
+} namespace_files[] = {
+	/* switch_to_process_ns() walks this table in array order. */
+#ifdef CLONE_NEWCGROUP
+	{ .nstype = CLONE_NEWCGROUP, .name = "cgroup", .fd = -1 },
+#endif
+
+#ifdef CLONE_NEWIPC
+	{ .nstype = CLONE_NEWIPC, .name = "ipc", .fd = -1 },
+#endif
+
+#ifdef CLONE_NEWUTS
+	{ .nstype = CLONE_NEWUTS, .name = "uts", .fd = -1 },
+#endif
+
+#ifdef CLONE_NEWNET
+	{ .nstype = CLONE_NEWNET, .name = "net", .fd = -1 },
+#endif
+
+#ifdef CLONE_NEWPID
+	{ .nstype = CLONE_NEWPID, .name = "pid", .fd = -1 },
+#endif
+
+#ifdef CLONE_NEWTIME
+	{ .nstype = CLONE_NEWTIME, .name = "time", .fd = -1 },
+#endif
+
+#ifdef CLONE_NEWNS
+	{ .nstype = CLONE_NEWNS, .name = "mnt", .fd = -1 },
+#endif
+
+#ifdef CLONE_NEWUSER
+	{ .nstype = CLONE_NEWUSER, .name = "user", .fd = -1 },
+#endif
+};
+
+#define NS_PATH_FMT    "/proc/%d/ns/%s"	/* arguments: pid, namespace entry name */
+#define NS_PATH_MAXLEN (6 + 10 + 4 + 6 + 1)	/* presumably "/proc/" + 10 pid digits + "/ns/" + 6-char name + NUL -- confirm */
+
+/**
+ * in_same_user_ns - return true if two processes are in the same user
+ *                   namespace.
+ * @pid_a: the pid of the first process
+ * @pid_b: the pid of the second process
+ *
+ * Works by comparing the device and inode numbers that stat() reports for
+ * /proc/<pid>/ns/user of each process.
+ *
+ * Returns 1 if both match; 0 if they differ or either stat() call fails.
+ */
+static int
+in_same_user_ns(pid_t pid_a, pid_t pid_b)
+{
+	char path[NS_PATH_MAXLEN];
+	struct stat st_a, st_b;
+
+	snprintf(path, sizeof(path), NS_PATH_FMT, pid_a, "user");
+	if (stat(path, &st_a) != 0)
+		return 0;
+
+	snprintf(path, sizeof(path), NS_PATH_FMT, pid_b, "user");
+	if (stat(path, &st_b) != 0)
+		return 0;
+
+	return st_a.st_dev == st_b.st_dev && st_a.st_ino == st_b.st_ino;
+}
+
+/**
+ * switch_to_process_ns - join the namespaces of the specified process.
+ * @pid: initiating pid value from the upcall string
+ *
+ * Opens each /proc/<pid>/ns/<name> listed in namespace_files and then calls
+ * setns() on it, so that we have the same access and configuration as the
+ * process that triggered the lookup.
+ * Returns 0 on success, or -1 with errno set to the cause on failure.
+ */
+static int
+switch_to_process_ns(pid_t pid)
+{
+	int count = sizeof(namespace_files) / sizeof(namespace_files[0]);
+	int n, err = 0;
+	int rc = 0;
+
+	/* First, open all the namespace fds.  We do this first because
+	   the namespace changes might prohibit us from opening them. */
+	for (n = 0; n < count; ++n) {
+		char nspath[NS_PATH_MAXLEN];
+		int ret, fd;
+
+#ifdef CLONE_NEWUSER
+		if (namespace_files[n].nstype == CLONE_NEWUSER
+		    && in_same_user_ns(getpid(), pid)) {
+			/* Switching to the same user namespace is forbidden,
+			   because switching to a user namespace grants all
+			   capabilities in that namespace regardless of uid. */
+			namespace_files[n].fd = -1;
+			continue;
+		}
+#endif
+
+		ret = snprintf(nspath, NS_PATH_MAXLEN, NS_PATH_FMT,
+			       pid, namespace_files[n].name);
+		if (ret < 0 || ret >= NS_PATH_MAXLEN) {
+			syslog(LOG_DEBUG, "%s: unterminated path!\n", __func__);
+			err = ENAMETOOLONG;
+			rc = -1;
+			goto out;
+		}
+
+		fd = open(nspath, O_RDONLY);
+		if (fd < 0 && errno != ENOENT) {
+			/*
+			 * don't stop on non-existing ns
+			 * but stop for other errors
+			 */
+			err = errno;
+			rc = -1;
+			goto out;
+		}
+
+		namespace_files[n].fd = fd;
+	}
+
+	/* Next, call setns for each of them */
+	for (n = 0; n < count; ++n) {
+		/* skip non-existing ns */
+		if (namespace_files[n].fd < 0)
+			continue;
+
+		rc = setns(namespace_files[n].fd, namespace_files[n].nstype);
+
+		if (rc < 0) {
+			err = errno;	/* save before syslog() can clobber it */
+			syslog(LOG_DEBUG, "%s: setns() failed for %s\n",
+			       __func__, namespace_files[n].name);
+			goto out;
+		}
+	}
+
+out:
+	/* Finally, close all the fds */
+	for (n = 0; n < count; ++n) {
+		if (namespace_files[n].fd != -1) {
+			close(namespace_files[n].fd);
+			namespace_files[n].fd = -1;
+		}
+	}
+
+	if (rc != 0) {	/* close() above may have changed errno */
+		errno = err;
+	}
+
+	return rc;
+}
+
 #define	ENV_PATH_FMT			"/proc/%d/environ"
 #define	ENV_PATH_MAXLEN			(6 + 10 + 8 + 1)
 
@@ -1099,6 +1258,19 @@ int main(const int argc, char *const argv[])
 	env_cachename =
 		get_cachename_from_process_env(env_probe ? arg.pid : 0);
 
+	/*
+	 * Change to the process's namespace. This means that things will work
+	 * acceptably in containers, because we'll be looking at the correct
+	 * filesystem and have the correct network configuration.
+	 */
+	rc = switch_to_process_ns(arg.pid);
+	if (rc == -1) {
+		syslog(LOG_ERR, "unable to switch to process namespace: %s",
+		       strerror(errno));
+		rc = 1;
+		goto out;
+	}
+
 	rc = setuid(uid);
 	if (rc == -1) {
 		syslog(LOG_ERR, "setuid: %s", strerror(errno));