Bug 1440203 - Support memfd_create in IPC shared memory. r=glandium

This commit also allows `memfd_create` in the seccomp-bpf policy for all
process types.

`memfd_create` is an API added in Linux 3.17 (and adopted by FreeBSD
for the upcoming version 13) for creating anonymous shared memory
not connected to any filesystem.  Supporting it means that sandboxed
child processes on Linux can create shared memory directly instead of
messaging a broker, which is unavoidably slower, and it should avoid
the problems we'd been seeing with overly small `/dev/shm` in container
environments (which were causing serious problems for using Firefox for
automated testing of frontend projects).

`memfd_create` also introduces the related operation of file seals:
irrevocably preventing types of modifications to a file.  Unfortunately,
the most useful one, `F_SEAL_WRITE`, can't be relied on; see the large
comment in `SharedMemory::ReadOnlyCopy` for details.  So we still use
the applicable seals as defense in depth, but read-only copies are
implemented on Linux by using procfs (and see the comments on the
`ReadOnlyCopy` function in `shared_memory_posix.cc` for the subtleties
there).

There's also a FreeBSD implementation, using `cap_rights_limit` for
read-only copies, if the build host is new enough to have the
`memfd_create` function.

Differential Revision: https://phabricator.services.mozilla.com/D90605
This commit is contained in:
Jed Davis 2020-10-06 19:20:29 +00:00
parent 8d4ba1a2ad
commit c4968e6653
6 changed files with 330 additions and 51 deletions

View File

@ -65,6 +65,14 @@ check_headers(
'byteswap.h',
)
# memfd_create(2) -- Note that older versions of the Linux man-pages
# project incorrectly cite <sys/memfd.h>, which doesn't exist; this
# was fixed in the man-pages-5.00 release.
#
# HAVE_MEMFD_CREATE is set from the result of test-compiling a call to
# memfd_create() with only <sys/mman.h> included, i.e. it reflects
# whether the C library declares the wrapper in that header.
set_define('HAVE_MEMFD_CREATE',
try_compile(includes=['sys/mman.h'],
body='memfd_create("", 0);',
check_msg='for memfd_create in sys/mman.h'))
# TODO: Move these checks to file specific to --enable-project=js.
have_perf_event_h = check_header('linux/perf_event.h',
when=building_linux)

View File

@ -1349,5 +1349,10 @@ if CONFIG['OS_TARGET'] == 'Linux' and CONFIG['CPU_ARCH'].startswith('mips'):
'sys/cachectl.h',
]
# FreeBSD only: add <sys/capsicum.h> to the system_headers list.
if CONFIG['OS_TARGET'] == 'FreeBSD':
system_headers += [
'sys/capsicum.h',
]
if CONFIG['MOZ_APP_SYSTEM_HEADERS']:
include("../" + CONFIG['MOZ_BUILD_APP'] + "/app-system-headers.mozbuild")

View File

@ -0,0 +1,69 @@
/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
/* vim: set ts=8 sts=2 et sw=2 tw=80: */
/* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at https://mozilla.org/MPL/2.0/. */
#ifndef BASE_LINUX_MEMFD_DEFS_H
#define BASE_LINUX_MEMFD_DEFS_H

// Fallback definitions for memfd_create(2) and file sealing.
//
// glibc before 2.27 didn't have a memfd_create wrapper, and if the
// build system is old enough then it won't have the syscall number
// and various related constants either.  Everything below is defined
// only if the system headers did not already provide it.

// Include the headers that provide these names on new-enough systems
// *before* testing for them, so that the outcome of the #ifndef checks
// below does not depend on the order of #includes in the including
// file.
#include <fcntl.h>        // F_ADD_SEALS, F_GET_SEALS, F_SEAL_*
#include <sys/mman.h>     // MFD_CLOEXEC, MFD_ALLOW_SEALING
#include <sys/syscall.h>  // SYS_memfd_create

// Per-architecture system call number for memfd_create.
#if defined(__x86_64__)
#  define MEMFD_CREATE_NR 319
#elif defined(__i386__)
#  define MEMFD_CREATE_NR 356
#elif defined(__aarch64__)
#  define MEMFD_CREATE_NR 279
#elif defined(__arm__)
#  define MEMFD_CREATE_NR 385
#elif defined(__powerpc__)
#  define MEMFD_CREATE_NR 360
#elif defined(__s390__)
#  define MEMFD_CREATE_NR 350
#elif defined(__mips__)
#  include <sgidefs.h>
#  if _MIPS_SIM == _MIPS_SIM_ABI32
#    define MEMFD_CREATE_NR 4354
#  elif _MIPS_SIM == _MIPS_SIM_ABI64
#    define MEMFD_CREATE_NR 5314
#  elif _MIPS_SIM == _MIPS_SIM_NABI32
#    define MEMFD_CREATE_NR 6318
#  endif  // mips subarch
#endif    // arch

// On an unlisted architecture MEMFD_CREATE_NR stays undefined, and
// SYS_memfd_create is then available only if the system headers have it.
#ifdef MEMFD_CREATE_NR
#  ifdef SYS_memfd_create
// Cross-check the hard-coded table against the system's own value.
static_assert(MEMFD_CREATE_NR == SYS_memfd_create,
              "MEMFD_CREATE_NR should match the actual SYS_memfd_create value");
#  else  // defined here but not in system headers
#    define SYS_memfd_create MEMFD_CREATE_NR
#  endif
#endif

// Flags for memfd_create; each is guarded separately so that a header
// providing only one of them still ends up with both.
#ifndef MFD_CLOEXEC
#  define MFD_CLOEXEC 0x0001U
#endif
#ifndef MFD_ALLOW_SEALING
#  define MFD_ALLOW_SEALING 0x0002U
#endif

// fcntl commands and seal bits for file sealing.
#ifndef F_ADD_SEALS
#  ifndef F_LINUX_SPECIFIC_BASE
#    define F_LINUX_SPECIFIC_BASE 1024
#  endif
#  define F_ADD_SEALS (F_LINUX_SPECIFIC_BASE + 9)
#  define F_GET_SEALS (F_LINUX_SPECIFIC_BASE + 10)
#  define F_SEAL_SEAL 0x0001   /* prevent further seals from being set */
#  define F_SEAL_SHRINK 0x0002 /* prevent file from shrinking */
#  define F_SEAL_GROW 0x0004   /* prevent file from growing */
#  define F_SEAL_WRITE 0x0008  /* prevent writes */
#endif

// Guarded on its own so that headers which have the other seals but
// not this one still get it.
#ifndef F_SEAL_FUTURE_WRITE
#  define F_SEAL_FUTURE_WRITE 0x0010
#endif

#endif  // BASE_LINUX_MEMFD_DEFS_H

View File

@ -218,6 +218,7 @@ class SharedMemory {
bool external_section_ = false;
#elif defined(OS_POSIX)
mozilla::UniqueFileHandle frozen_file_;
bool is_memfd_ = false;
#endif
bool read_only_ = false;
bool freezeable_ = false;

View File

@ -16,6 +16,14 @@
# include "mozilla/Ashmem.h"
#endif
#ifdef OS_LINUX
# include "linux_memfd_defs.h"
#endif
#ifdef __FreeBSD__
# include <sys/capsicum.h>
#endif
#include "base/eintr_wrapper.h"
#include "base/logging.h"
#include "base/string_util.h"
@ -51,6 +59,7 @@ bool SharedMemory::SetHandle(SharedMemoryHandle handle, bool read_only) {
freezeable_ = false;
mapped_file_.reset(handle.fd);
read_only_ = read_only;
// is_memfd_ only matters for freezing, which isn't possible
return true;
}
@ -62,11 +71,120 @@ bool SharedMemory::IsHandleValid(const SharedMemoryHandle& handle) {
// static
// Returns a default-constructed SharedMemoryHandle.
// NOTE(review): presumably this is the "no handle" value that
// IsHandleValid() rejects -- confirm against SharedMemoryHandle's
// default constructor.
SharedMemoryHandle SharedMemory::NULLHandle() { return SharedMemoryHandle(); }
// memfd_create is a nonstandard interface for creating anonymous
// shared memory accessible as a file descriptor but not tied to any
// filesystem. It first appeared in Linux 3.17, and was adopted by
// FreeBSD in version 13.
#if defined(OS_LINUX) && defined(SYS_memfd_create) && \
    !defined(HAVE_MEMFD_CREATE)
// Fallback for C libraries that lack the wrapper (e.g., glibc before
// 2.27): issue the system call directly.  The syscall number comes
// either from the system headers or from `linux_memfd_defs.h`.
//
// Same contract as the libc wrapper: returns a new file descriptor, or
// -1 with errno set.
static int memfd_create(const char* name, unsigned int flags) {
  const long rv = syscall(SYS_memfd_create, name, flags);
  return static_cast<int>(rv);
}
// From here on the rest of the file can treat memfd_create as available.
#  define HAVE_MEMFD_CREATE 1
#endif
// memfd supports having "seals" applied to the file, to prevent
// various types of changes (which apply to all fds referencing the
// file). Unfortunately, we can't rely on F_SEAL_WRITE to implement
// Freeze(); see the comments in ReadOnlyCopy() below.
//
// Instead, to prevent a child process from regaining write access to
// a read-only copy, the OS must also provide a way to remove write
// permissions at the file descriptor level. This next section
// attempts to accomplish that.
#ifdef HAVE_MEMFD_CREATE
# define USE_MEMFD_CREATE 1
# ifdef OS_LINUX
// To create a read-only duplicate of an fd, we can use procfs; the
// same operation could restore write access, but sandboxing prevents
// child processes from accessing /proc.
static int DupReadOnly(int fd) {
std::string path = StringPrintf("/proc/self/fd/%d", fd);
// procfs opens probably won't EINTR, but checking for it can't hurt
return HANDLE_EINTR(open(path.c_str(), O_RDONLY | O_CLOEXEC));
}
# elif defined(__FreeBSD__)
// FreeBSD's Capsicum framework allows irrevocably restricting the
// operations permitted on a file descriptor.
//
// Duplicates `fd` and limits the duplicate to CAP_FSTAT and
// CAP_MMAP_R, leaving `fd` itself untouched.
//
// Returns the restricted close-on-exec descriptor, or -1 with errno
// set by the call that failed.
static int DupReadOnly(int fd) {
  // Use F_DUPFD_CLOEXEC rather than dup(): dup() clears close-on-exec
  // on the new descriptor, whereas the Linux implementation opens its
  // copy with O_CLOEXEC and the memfd itself is created with
  // MFD_CLOEXEC.
  int rofd = fcntl(fd, F_DUPFD_CLOEXEC, 0);
  if (rofd < 0) {
    return -1;
  }

  cap_rights_t rights;
  cap_rights_init(&rights, CAP_FSTAT, CAP_MMAP_R);
  if (cap_rights_limit(rofd, &rights) < 0) {
    // close() may overwrite errno; report the cap_rights_limit error.
    const int err = errno;
    close(rofd);
    errno = err;
    return -1;
  }

  return rofd;
}
# else // unhandled OS
# warning "OS has memfd_create but no DupReadOnly implementation"
# undef USE_MEMFD_CREATE
# endif // OS selection
#endif // HAVE_MEMFD_CREATE
#ifdef USE_MEMFD_CREATE
// Probes whether memfd-backed shared memory is usable in this process.
// Called once, from HaveMemfd().
static bool ProbeMemfd() {
#  ifdef OS_LINUX
  // The Tor Browser project was, at one point, attempting to run
  // Firefox in an environment without /proc mounted, to reduce
  // possibilities for fingerprinting.  ReadOnlyCopy needs procfs to
  // drop write permission, so without it memfd can't be used.
  //
  // A sandboxed child process may make its first call after sandboxing
  // has started; procfs is then expected to be unreachable (and
  // ReadOnlyCopy may not be possible anyway), so the check is skipped
  // when MOZ_SANDBOXED is set.
  const bool sandboxed = PR_GetEnv("MOZ_SANDBOXED") != nullptr;
  if (!sandboxed) {
    const bool procfsUsable = access("/proc/self/fd", R_OK | X_OK) >= 0;
    if (!procfsUsable) {
      CHROMIUM_LOG(WARNING) << "can't use memfd without procfs";
      return false;
    }
  }
#  endif

  // Create (and immediately discard) a throwaway memfd to see whether
  // the call is supported.
  const int probeFd =
      memfd_create("mozilla-ipc-test", MFD_CLOEXEC | MFD_ALLOW_SEALING);
  if (probeFd < 0) {
    DCHECK_EQ(errno, ENOSYS);
    return false;
  }
  close(probeFd);
  return true;
}
#endif  // USE_MEMFD_CREATE

// Returns true if shared memory should be created with memfd_create.
// The answer is computed on the first call and cached; it is always
// false when USE_MEMFD_CREATE is not defined.
static bool HaveMemfd() {
#ifdef USE_MEMFD_CREATE
  static const bool kHave = ProbeMemfd();
  return kHave;
#else
  return false;
#endif  // USE_MEMFD_CREATE
}
// static
bool SharedMemory::AppendPosixShmPrefix(std::string* str, pid_t pid) {
#if defined(ANDROID)
return false;
#else
if (HaveMemfd()) {
return false;
}
*str += '/';
# ifdef OS_LINUX
// The Snap package environment doesn't provide a private /dev/shm
@ -103,48 +221,74 @@ bool SharedMemory::CreateInternal(size_t size, bool freezeable) {
mozilla::UniqueFileHandle fd;
mozilla::UniqueFileHandle frozen_fd;
bool needs_truncate = true;
bool is_memfd = false;
#ifdef ANDROID
// Android has its own shared memory facility:
fd.reset(mozilla::android::ashmem_create(nullptr, size));
if (!fd) {
CHROMIUM_LOG(WARNING) << "failed to open shm: " << strerror(errno);
return false;
}
needs_truncate = false;
#else
// Generic Unix: shm_open + shm_unlink
do {
// The names don't need to be unique, but it saves time if they
// usually are.
static mozilla::Atomic<size_t> sNameCounter;
std::string name;
CHECK(AppendPosixShmPrefix(&name, getpid()));
StringAppendF(&name, "%zu", sNameCounter++);
// O_EXCL means the names being predictable shouldn't be a problem.
fd.reset(
HANDLE_EINTR(shm_open(name.c_str(), O_RDWR | O_CREAT | O_EXCL, 0600)));
if (fd) {
if (freezeable) {
frozen_fd.reset(HANDLE_EINTR(shm_open(name.c_str(), O_RDONLY, 0400)));
if (!frozen_fd) {
int open_err = errno;
shm_unlink(name.c_str());
DLOG(FATAL) << "failed to re-open freezeable shm: "
<< strerror(open_err);
return false;
}
}
if (shm_unlink(name.c_str()) != 0) {
// This shouldn't happen, but if it does: assume the file is
// in fact leaked, and bail out now while it's still 0-length.
DLOG(FATAL) << "failed to unlink shm: " << strerror(errno);
#ifdef USE_MEMFD_CREATE
if (HaveMemfd()) {
const unsigned flags = MFD_CLOEXEC | (freezeable ? MFD_ALLOW_SEALING : 0);
fd.reset(memfd_create("mozilla-ipc", flags));
if (!fd) {
// In general it's too late to fall back here -- in a sandboxed
// child process, shm_open is already blocked. And it shouldn't
// be necessary.
CHROMIUM_LOG(WARNING) << "failed to create memfd: " << strerror(errno);
return false;
}
is_memfd = true;
if (freezeable) {
frozen_fd.reset(DupReadOnly(fd.get()));
if (!frozen_fd) {
CHROMIUM_LOG(WARNING)
<< "failed to create read-only memfd: " << strerror(errno);
return false;
}
}
} while (!fd && errno == EEXIST);
}
#endif
if (!fd) {
#ifdef ANDROID
// Android has its own shared memory facility:
fd.reset(mozilla::android::ashmem_create(nullptr, size));
if (!fd) {
CHROMIUM_LOG(WARNING) << "failed to open shm: " << strerror(errno);
return false;
}
needs_truncate = false;
#else
// Generic Unix: shm_open + shm_unlink
do {
// The names don't need to be unique, but it saves time if they
// usually are.
static mozilla::Atomic<size_t> sNameCounter;
std::string name;
CHECK(AppendPosixShmPrefix(&name, getpid()));
StringAppendF(&name, "%zu", sNameCounter++);
// O_EXCL means the names being predictable shouldn't be a problem.
fd.reset(HANDLE_EINTR(
shm_open(name.c_str(), O_RDWR | O_CREAT | O_EXCL, 0600)));
if (fd) {
if (freezeable) {
frozen_fd.reset(HANDLE_EINTR(shm_open(name.c_str(), O_RDONLY, 0400)));
if (!frozen_fd) {
int open_err = errno;
shm_unlink(name.c_str());
DLOG(FATAL) << "failed to re-open freezeable shm: "
<< strerror(open_err);
return false;
}
}
if (shm_unlink(name.c_str()) != 0) {
// This shouldn't happen, but if it does: assume the file is
// in fact leaked, and bail out now while it's still 0-length.
DLOG(FATAL) << "failed to unlink shm: " << strerror(errno);
return false;
}
}
} while (!fd && errno == EEXIST);
#endif
}
if (!fd) {
CHROMIUM_LOG(WARNING) << "failed to open shm: " << strerror(errno);
return false;
@ -200,6 +344,7 @@ bool SharedMemory::CreateInternal(size_t size, bool freezeable) {
frozen_file_ = std::move(frozen_fd);
max_size_ = size;
freezeable_ = freezeable;
is_memfd_ = is_memfd;
return true;
}
@ -213,22 +358,74 @@ bool SharedMemory::ReadOnlyCopy(SharedMemory* ro_out) {
}
mozilla::UniqueFileHandle ro_file;
bool is_ashmem = false;
#ifdef ANDROID
ro_file = std::move(mapped_file_);
if (mozilla::android::ashmem_setProt(ro_file.get(), PROT_READ) != 0) {
CHROMIUM_LOG(WARNING) << "failed to set ashmem read-only: "
<< strerror(errno);
return false;
if (!is_memfd_) {
is_ashmem = true;
DCHECK(!frozen_file_);
ro_file = std::move(mapped_file_);
if (mozilla::android::ashmem_setProt(ro_file.get(), PROT_READ) != 0) {
CHROMIUM_LOG(WARNING)
<< "failed to set ashmem read-only: " << strerror(errno);
return false;
}
}
#else
DCHECK(frozen_file_);
mapped_file_ = nullptr;
ro_file = std::move(frozen_file_);
#endif
#ifdef USE_MEMFD_CREATE
static const bool useSeals = !PR_GetEnv("MOZ_SHM_NO_SEALS");
if (is_memfd_ && useSeals) {
// Seals are added to the file as defense-in-depth. The primary
// method of access control is creating a read-only fd (using
// procfs in this case) and requiring that sandboxed processes not
// have access to /proc/self/fd to regain write permission; this
// is the same as with shm_open.
//
// Unfortunately, F_SEAL_WRITE is unreliable: if the process
// forked while there was a writeable mapping, it will inherit a
// copy of the mapping, which causes the seal to fail.
//
// (Also, in the future we may want to split this into separate
// classes for mappings and shared memory handles, which would
// complicate identifying the case where `F_SEAL_WRITE` would be
// possible even in the absence of races with fork.)
//
// However, Linux 5.1 added F_SEAL_FUTURE_WRITE, which prevents
// write operations afterwards, but existing writeable mappings
// are unaffected (similar to ashmem protection semantics).
const int seals = F_SEAL_GROW | F_SEAL_SHRINK | F_SEAL_SEAL;
int sealError = EINVAL;
# ifdef F_SEAL_FUTURE_WRITE
sealError =
fcntl(mapped_file_.get(), F_ADD_SEALS, seals | F_SEAL_FUTURE_WRITE) == 0
? 0
: errno;
# endif // F_SEAL_FUTURE_WRITE
if (sealError == EINVAL) {
sealError =
fcntl(mapped_file_.get(), F_ADD_SEALS, seals) == 0 ? 0 : errno;
}
if (sealError != 0) {
CHROMIUM_LOG(WARNING) << "failed to seal memfd: " << strerror(errno);
return false;
}
}
#else // !USE_MEMFD_CREATE
DCHECK(!is_memfd_);
#endif
if (!is_ashmem) {
DCHECK(frozen_file_);
DCHECK(mapped_file_);
mapped_file_ = nullptr;
ro_file = std::move(frozen_file_);
}
DCHECK(ro_file);
freezeable_ = false;
ro_out->Close();
ro_out->mapped_file_ = std::move(ro_file);
ro_out->max_size_ = max_size_;

View File

@ -704,6 +704,10 @@ class SandboxPolicyCommon : public SandboxPolicyBase {
case __NR_munmap:
return Allow();
// Shared memory
case __NR_memfd_create:
return Allow();
// ipc::Shmem; also, glibc when creating threads:
case __NR_mprotect:
return Allow();
@ -1395,11 +1399,6 @@ class ContentSandboxPolicy : public SandboxPolicyCommon {
case __NR_eventfd2:
return Allow();
# ifdef __NR_memfd_create
case __NR_memfd_create:
return Allow();
# endif
# ifdef __NR_rt_tgsigqueueinfo
// Only allow to send signals within the process.
case __NR_rt_tgsigqueueinfo: {