executor: new namespace-based sandbox

2024-11-25 12:29:43 +00:00 · 2016-01-22 18:09:32 +01:00 · 2016-01-22 18:09:32 +01:00 · 1e06d2bafc
commit 1e06d2bafc
parent 3247604265
4 changed files with 209 additions and 89 deletions
--- a/executor/executor.cc
+++ b/executor/executor.cc
@ -2,6 +2,7 @@
 // Use of this source code is governed by Apache 2 LICENSE that can be found in the LICENSE file.

 #include <algorithm>
+#include <dirent.h>
 #include <errno.h>
 #include <fcntl.h>
 #include <grp.h>
@ -18,6 +19,7 @@
 #include <string.h>
 #include <sys/ioctl.h>
 #include <sys/mman.h>
+#include <sys/mount.h>
 #include <sys/prctl.h>
 #include <sys/reboot.h>
 #include <sys/resource.h>
@ -69,7 +71,6 @@ bool flag_threaded;
 bool flag_collide;
 bool flag_deduplicate;
 bool flag_drop_privs;
-bool flag_no_setpgid;

 __attribute__((aligned(64 << 10))) char input_data[kMaxInput];
 __attribute__((aligned(64 << 10))) char output_data[kMaxOutput];
@ -77,6 +78,8 @@ uint32_t* output_pos;
 int completed;
 int running;
 bool collide;
+int real_uid;
+int real_gid;

 struct res_t {
 	bool executed;
@ -87,7 +90,6 @@ res_t results[kMaxCommands];

 struct thread_t {
 	bool created;
-	bool root;
 	int id;
 	pthread_t th;
 	uintptr_t* cover_data;
@ -107,11 +109,13 @@ struct thread_t {
 };

 thread_t threads[kMaxThreads];
+char loop_stack[1 << 20];

 __attribute__((noreturn)) void fail(const char* msg, ...);
 __attribute__((noreturn)) void error(const char* msg, ...);
 __attribute__((noreturn)) void exitf(const char* msg, ...);
 void debug(const char* msg, ...);
+int loop(void* arg);
 void execute_one();
 uint64_t read_input(uint64_t** input_posp, bool peek = false);
 uint64_t read_arg(uint64_t** input_posp);
@ -122,8 +126,11 @@ uint64_t copyout(char* addr, uint64_t size);
 thread_t* schedule_call(int n, int call_index, int call_num, uint64_t num_args, uint64_t* args, uint64_t* pos);
 void execute_call(thread_t* th);
 void handle_completion(thread_t* th);
-void thread_create(thread_t* th, int id, bool root);
+void thread_create(thread_t* th, int id);
 void* worker_thread(void* arg);
+void sandbox();
+bool write_file(const char* file, const char* what, ...);
+void remove_dir(const char* dir);
 uint64_t current_time_ms();
 void cover_open();
 void cover_enable(thread_t* th);
@ -138,6 +145,7 @@ int main(int argc, char** argv)
 		return 0;
 	}

+	prctl(PR_SET_PDEATHSIG, SIGKILL, 0, 0, 0);
 	if (mmap(&input_data[0], kMaxInput, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_FIXED, kInFd, 0) != &input_data[0])
 		fail("mmap of input file failed");
 	if (mmap(&output_data[0], kMaxOutput, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_FIXED, kOutFd, 0) != &output_data[0])
@ -148,8 +156,6 @@ int main(int argc, char** argv)
 	// That's also the reason why we close kInPipeFd/kOutPipeFd below.
 	close(kInFd);
 	close(kOutFd);
-	char cwdbuf[64 << 10];
-	char* cwd = getcwd(cwdbuf, sizeof(cwdbuf));

 	uint64_t flags = *(uint64_t*)input_data;
 	flag_debug = flags & (1 << 0);
@ -158,68 +164,70 @@ int main(int argc, char** argv)
 	flag_collide = flags & (1 << 3);
 	flag_deduplicate = flags & (1 << 4);
 	flag_drop_privs = flags & (1 << 5);
-	flag_no_setpgid = flags & (1 << 6);
 	if (!flag_threaded)
 		flag_collide = false;

 	cover_open();

-	// Do some sandboxing in parent process.
-	struct rlimit rlim;
-	rlim.rlim_cur = rlim.rlim_max = 64 << 20;
-	setrlimit(RLIMIT_AS, &rlim);
-	rlim.rlim_cur = rlim.rlim_max = 1 << 20;
-	setrlimit(RLIMIT_FSIZE, &rlim);
-	rlim.rlim_cur = rlim.rlim_max = 0;
-	setrlimit(RLIMIT_CORE, &rlim);
+	// Don't need that SIGCANCEL/SIGSETXID glibc stuff.
+	// SIGCANCEL sent to main thread causes it to exit
+	// without bringing down the whole group.
+	struct sigaction sa;
+	memset(&sa, 0, sizeof(sa));
+	sa.sa_handler = SIG_IGN;
+	syscall(SYS_rt_sigaction, 0x20, &sa, NULL, 8);
+	syscall(SYS_rt_sigaction, 0x21, &sa, NULL, 8);
+
+	real_uid = getuid();
+	real_gid = getgid();
+
+	int pid = clone(loop, &loop_stack[sizeof(loop_stack) - 8],
+			CLONE_NEWUSER | CLONE_NEWNS | CLONE_NEWPID | CLONE_NEWUTS | CLONE_NEWNET, NULL);
+	if (pid < 0)
+		fail("clone failed");
+	debug("spawned loop pid %d\n", pid);
+	int status = 0;
+	while (waitpid(pid, &status, __WALL) != pid) {
+	}
+	status = WEXITSTATUS(status);
+	if (status == kFailStatus)
+		fail("loop failed");
+	if (status == kErrorStatus)
+		error("loop errored");
+	fail("loop exited with status %d", status);
+	return 0;
+}
+
+int loop(void* arg)
+{
+	sandbox();
+
+	for (int iter = 0;; iter++) {
+		// Create a new private work dir for this test (removed at the end of the loop).
+		char cwdbuf[256];
+		sprintf(cwdbuf, "/%d", iter);
+		if (mkdir(cwdbuf, 0777))
+			fail("failed to mkdir");
+		if (chdir(cwdbuf))
+			fail("failed to chdir");

-	for (;;) {
 		char tmp;
 		if (read(kInPipeFd, &tmp, 1) != 1)
 			fail("control pipe read failed");
-		debug("received start command\n");
-		// The dir may have been recreated.
-		if (chdir(cwd))
-			fail("failed to chdir");

 		int pid = fork();
 		if (pid < 0)
-			fail("fork failed");
+			fail("clone failed");
 		if (pid == 0) {
 			prctl(PR_SET_PDEATHSIG, SIGKILL, 0, 0, 0);
-			if (!flag_no_setpgid)
-				setpgid(0, 0);
-			unshare(CLONE_NEWNS);
+			setpgrp();
 			close(kInPipeFd);
 			close(kOutPipeFd);
-			if (flag_drop_privs) {
-				// Pre-create one thread with root privileges for execution of special syscalls (e.g. mount).
-				if (flag_threaded)
-					thread_create(&threads[kMaxThreads - 1], kMaxThreads - 1, true);
-				// TODO: 65534 is meant to be nobody
-				if (setgroups(0, NULL))
-					fail("failed to setgroups");
-				// glibc versions do not we want -- they force all threads to setuid.
-				// We want to preserve the thread above as root.
-				if (syscall(SYS_setresgid, 65534, 65534, 65534))
-					fail("failed to setresgid");
-				if (syscall(SYS_setresuid, 65534, 65534, 65534))
-					fail("failed to setresuid");
-			}
-			// Don't need that SIGCANCEL/SIGSETXID glibc stuff.
-			// SIGCANCEL sent to main thread causes it to exit
-			// without bringing down the whole group.
-			struct sigaction sa;
-			memset(&sa, 0, sizeof(sa));
-			sa.sa_handler = SIG_IGN;
-			syscall(SYS_rt_sigaction, 0x20, &sa, NULL, 8);
-			syscall(SYS_rt_sigaction, 0x21, &sa, NULL, 8);
-
 			execute_one();
-
-			debug("exiting\n");
-			return 0;
+			debug("worker exiting\n");
+			exit(0);
 		}
+		debug("spawned worker pid %d\n", pid);

 		// We used to use sigtimedwait(SIGCHLD) to wait for the subprocess.
 		// But SIGCHLD is also delivered when a process stops/continues,
@ -253,11 +261,67 @@ int main(int argc, char** argv)
 			fail("child failed");
 		if (status == kErrorStatus)
 			error("child errored");
+		remove_dir(cwdbuf);
 		if (write(kOutPipeFd, &tmp, 1) != 1)
 			fail("control pipe write failed");
 	}
 }

+void sandbox()
+{
+	prctl(PR_SET_PDEATHSIG, SIGKILL, 0, 0, 0);
+	setpgrp();
+	setsid();
+
+	struct rlimit rlim;
+	rlim.rlim_cur = rlim.rlim_max = 128 << 20;
+	setrlimit(RLIMIT_AS, &rlim);
+	rlim.rlim_cur = rlim.rlim_max = 1 << 20;
+	setrlimit(RLIMIT_FSIZE, &rlim);
+	rlim.rlim_cur = rlim.rlim_max = 1 << 20;
+	setrlimit(RLIMIT_STACK, &rlim);
+	rlim.rlim_cur = rlim.rlim_max = 0;
+	setrlimit(RLIMIT_CORE, &rlim);
+
+	// CLONE_NEWIPC/CLONE_IO cause EINVAL on some systems, so we do them separately of clone.
+	unshare(CLONE_NEWIPC);
+	unshare(CLONE_IO);
+
+	// /proc/self/setgroups is not present on some systems, ignore error.
+	write_file("/proc/self/setgroups", "deny");
+	if (!write_file("/proc/self/uid_map", "0 %d 1\n", real_uid))
+		fail("write of /proc/self/uid_map failed");
+	if (!write_file("/proc/self/gid_map", "0 %d 1\n", real_gid))
+		fail("write of /proc/self/gid_map failed");
+
+	if (mkdir("./syz-tmp", 0777))
+		fail("mkdir(syz-tmp) failed");
+	if (mount("", "./syz-tmp", "tmpfs", 0, NULL))
+		fail("mount(tmpfs) failed");
+	if (mkdir("./syz-tmp/newroot", 0777))
+		fail("mkdir failed");
+	if (mkdir("./syz-tmp/newroot/dev", 0700))
+		fail("mkdir failed");
+	if (mount("/dev", "./syz-tmp/newroot/dev", NULL, MS_BIND | MS_REC | MS_PRIVATE, NULL))
+		fail("mount(dev) failed");
+	if (mkdir("./syz-tmp/pivot", 0777))
+		fail("mkdir failed");
+	if (syscall(SYS_pivot_root, "./syz-tmp", "./syz-tmp/pivot")) {
+		debug("pivot_root failed\n");
+		if (chdir("./syz-tmp"))
+			fail("chdir failed");
+	} else {
+		if (chdir("/"))
+			fail("chdir failed");
+		if (umount2("./pivot", MNT_DETACH))
+			fail("umount failed");
+	}
+	if (chroot("./newroot"))
+		fail("chroot failed");
+	if (chdir("/"))
+		fail("chdir failed");
+}
+
 void execute_one()
 {
 retry:
@ -374,34 +438,15 @@ retry:

 thread_t* schedule_call(int n, int call_index, int call_num, uint64_t num_args, uint64_t* args, uint64_t* pos)
 {
-	// Figure out whether we need root privs for this call.
-	bool root = false;
-	switch (syscalls[call_num].sys_nr) {
-	case __NR_mount:
-	case __NR_umount2:
-	case __NR_syz_open_dev:
-	case __NR_syz_fuse_mount:
-	case __NR_syz_fuseblk_mount:
-		root = true;
-	default:
-		// Lots of dri ioctls require root.
-		// There are some generic permission checks that hopefully don't contain bugs,
-		// so let's just execute all them under root.
-		if (strncmp(syscalls[call_num].name, "ioctl$DRM", sizeof("ioctl$DRM")) == 0)
-			root = true;
-	}
-
 	// Find a spare thread to execute the call.
 	int i;
 	for (i = 0; i < kMaxThreads; i++) {
 		thread_t* th = &threads[i];
-		if (!th->created && (!flag_drop_privs || root == th->root))
-			thread_create(th, i, false);
+		if (!th->created)
+			thread_create(th, i);
 		if (__atomic_load_n(&th->done, __ATOMIC_ACQUIRE)) {
 			if (!th->handled)
 				handle_completion(th);
-			if (flag_drop_privs && root != th->root)
-				continue;
 			break;
 		}
 	}
@ -470,16 +515,19 @@ void handle_completion(thread_t* th)
 	running--;
 }

-void thread_create(thread_t* th, int id, bool root)
+void thread_create(thread_t* th, int id)
 {
 	th->created = true;
 	th->id = id;
-	th->root = root;
 	th->done = true;
 	th->handled = true;
 	if (flag_threaded) {
-		if (pthread_create(&th->th, 0, worker_thread, th))
+		pthread_attr_t attr;
+		pthread_attr_init(&attr);
+		pthread_attr_setstacksize(&attr, 128 << 10);
+		if (pthread_create(&th->th, &attr, worker_thread, th))
 			exitf("pthread_create failed");
+		pthread_attr_destroy(&attr);
 	}
 }

@ -768,6 +816,93 @@ void write_output(uint32_t v)
 	*output_pos++ = v;
 }

+bool write_file(const char* file, const char* what, ...)
+{
+	char buf[1024];
+	va_list args;
+	va_start(args, what);
+	vsnprintf(buf, sizeof(buf), what, args);
+	va_end(args);
+	buf[sizeof(buf) - 1] = 0;
+	int len = strlen(buf);
+
+	int fd = open(file, O_WRONLY | O_CLOEXEC);
+	if (fd == -1)
+		return false;
+	if (write(fd, buf, len) != len) {
+		close(fd);
+		return false;
+	}
+	close(fd);
+	return true;
+}
+
+// One does not simply remove a directory.
+// There can be mounts, so we need to try to umount.
+// Moreover, a mount can be mounted several times, so we need to try to umount in a loop.
+// Moreover, after umount a dir can become non-empty again, so we need another loop.
+// Moreover, a mount can be re-mounted as read-only and then we will fail to make a dir empty.
+void remove_dir(const char* dir)
+{
+	int iter = 0;
+retry:
+	DIR* dp = opendir(dir);
+	if (dp == NULL)
+		fail("opendir(%s) failed", dir);
+	while (dirent* ep = readdir(dp)) {
+		if (strcmp(ep->d_name, ".") == 0 || strcmp(ep->d_name, "..") == 0)
+			continue;
+		char filename[FILENAME_MAX];
+		snprintf(filename, sizeof(filename), "%s/%s", dir, ep->d_name);
+		struct stat st;
+		if (lstat(filename, &st))
+			fail("lstat(%s) failed", filename);
+		if (S_ISDIR(st.st_mode)) {
+			remove_dir(filename);
+			continue;
+		}
+		for (int i = 0;; i++) {
+			debug("unlink(%s)\n", filename);
+			if (unlink(filename) == 0)
+				break;
+			if (errno == EROFS) {
+				debug("ignoring EROFS\n");
+				break;
+			}
+			if (errno != EBUSY || i > 100)
+				fail("unlink(%s) failed", filename);
+			debug("umount(%s)\n", filename);
+			if (umount2(filename, MNT_DETACH))
+				fail("umount(%s) failed", filename);
+		}
+	}
+	closedir(dp);
+	for (int i = 0;; i++) {
+		debug("rmdir(%s)\n", dir);
+		if (rmdir(dir) == 0)
+			break;
+		if (i < 100) {
+			if (errno == EROFS) {
+				debug("ignoring EROFS\n");
+				break;
+			}
+			if (errno == EBUSY) {
+				debug("umount(%s)\n", dir);
+				if (umount2(dir, MNT_DETACH))
+					fail("umount(%s) failed", dir);
+				continue;
+			}
+			if (errno == ENOTEMPTY) {
+				if (iter < 100) {
+					iter++;
+					goto retry;
+				}
+			}
+		}
+		fail("rmdir(%s) failed", dir);
+	}
+}
+
 uint64_t current_time_ms()
 {
 	timespec ts;
--- a/ipc/ipc.go
+++ b/ipc/ipc.go
@ -42,7 +42,6 @@ const (
 	FlagCollide                        // collide syscalls to provoke data races
 	FlagDedupCover                     // deduplicate coverage in executor
 	FlagDropPrivs                      // impersonate nobody user
-	FlagNoSetpgid                      // don't use setpgid
 	FlagStrace                         // run executor under strace
 )

@ -372,6 +371,7 @@ func (c *command) kill() {
 func (c *command) exec() (output, strace []byte, failed, hanged bool, err0 error) {
 	var tmp [1]byte
 	if _, err := c.outwp.Write(tmp[:]); err != nil {
+		output, _ = ioutil.ReadAll(c.rp)
 		err0 = fmt.Errorf("failed to write control pipe: %v", err)
 		return
 	}
@ -391,13 +391,6 @@ func (c *command) exec() (output, strace []byte, failed, hanged bool, err0 error
 	//!!! handle c.rp overflow
 	_, readErr := c.inrp.Read(tmp[:])
 	close(done)
-	fileutil.UmountAll(c.dir)
-	os.RemoveAll(c.dir)
-	if err := os.Mkdir(c.dir, 0777); err != nil {
-		<-hang
-		err0 = fmt.Errorf("failed to create temp dir: %v", err)
-		return
-	}
 	if readErr == nil {
 		<-hang
 		return
--- a/tools/syz-execprog/execprog.go
+++ b/tools/syz-execprog/execprog.go
@ -33,7 +33,6 @@ var (
 	flagDedup     = flag.Bool("dedup", false, "deduplicate coverage in executor")
 	flagRepeat    = flag.Int("repeat", 1, "repeat execution that many times (0 for infinite loop)")
 	flagProcs     = flag.Int("procs", 1, "number of parallel processes to execute programs")
-	flagNoPgid    = flag.Bool("nopgid", false, "don't use setpgid syscall")
 	flagTimeout   = flag.Duration("timeout", 10*time.Second, "execution timeout")
 )

@ -83,9 +82,6 @@ func main() {
 	if *flagNobody {
 		flags |= ipc.FlagDropPrivs
 	}
-	if *flagNoPgid {
-		flags |= ipc.FlagNoSetpgid
-	}

 	var wg sync.WaitGroup
 	wg.Add(*flagProcs)
--- a/tools/syz-stress/stress.go
+++ b/tools/syz-stress/stress.go
@ -32,7 +32,6 @@ var (
 	flagThreaded = flag.Bool("threaded", true, "use threaded mode in executor")
 	flagCollide  = flag.Bool("collide", true, "collide syscalls to provoke data races")
 	flagNobody   = flag.Bool("nobody", true, "impersonate into nobody")
-	flagNoPgid   = flag.Bool("nopgid", false, "don't use setpgid syscall")
 	flagTimeout  = flag.Duration("timeout", 10*time.Second, "executor timeout")
 	flagLogProg  = flag.Bool("logprog", false, "print programs before execution")

@ -66,9 +65,6 @@ func main() {
 	if *flagDebug {
 		flags |= ipc.FlagDebug
 	}
-	if *flagNoPgid {
-		flags |= ipc.FlagNoSetpgid
-	}

 	gate = ipc.NewGate(2 * *flagProcs)
 	for pid := 0; pid < *flagProcs; pid++ {