Support XNU continuations, reenable kqueue Mach port support

This commit is contained in:
Lubos Dolezel 2020-01-24 17:20:23 +01:00
parent 8b5de389aa
commit 3e4d6aff01
11 changed files with 310 additions and 96 deletions

View File

@ -225,6 +225,8 @@ $(info Invoked by kernel build system, building for $(KERNELRELEASE))
darling/pthread_kill.o \
darling/psynch_support.o \
darling/foreign_mm.o \
darling/continuation.o \
darling/continuation-asm.o \
osfmk/duct/darling_xnu_init.o \
osfmk/duct/duct_atomic.o \
osfmk/duct/duct_ipc_pset.o \

View File

@ -0,0 +1,43 @@
/* Copyright (C) 2018 Intel Corporation */
#include <linux/linkage.h>
.text
.align 8
/*
 * int cont_setjmp(struct cont_jmpbuf *buf)
 *
 * Records the caller's resume point in the buffer pointed to by %rdi:
 * the return address, the stack pointer as it is once the return address
 * has been popped, and the registers rbp, rbx and r12-r15.
 * The store offsets (0, 8, ..., 56) are the ones cont_longjmp() reads back.
 *
 * Returns 0 in %rax on this direct invocation. A later cont_longjmp() on
 * the same buffer jumps to the saved return address with a non-zero value
 * in %rax instead.
 */
ENTRY(cont_setjmp)
pop %rcx /* Pop our return address; %rsp now holds the caller's post-return value */
movq %rcx, (%rdi) /* Return address */
movq %rsp, 8(%rdi) /* Stack pointer (after the pop above) */
movq %rbp, 16(%rdi) /* Frame pointer */
movq %rbx, 24(%rdi)
movq %r12, 32(%rdi)
movq %r13, 40(%rdi)
movq %r14, 48(%rdi)
movq %r15, 56(%rdi)
xorq %rax, %rax /* Direct invocation returns 0 */
jmpq *%rcx /* "Return" by jumping, since the return address was already popped */
ENDPROC(cont_setjmp)
.align 8
/*
 * void cont_longjmp(struct cont_jmpbuf *buf, int v)
 *
 * Restores the context stored by cont_setjmp() in the buffer at %rdi
 * (return address, rsp, rbp, rbx, r12-r15) and jumps to the saved return
 * address, so that the original cont_setjmp() call appears to return a
 * second time. Does not return to its own caller.
 *
 * The value delivered to the cont_setjmp() call site is v, except that
 * v == 0 is replaced by 1 so it cannot be mistaken for the direct return.
 *
 * v is a 32-bit int, so only %esi is meaningful: the upper half of %rsi is
 * not guaranteed to be zero. The value is therefore copied and tested with
 * 32-bit operations (writing %eax also clears the upper half of %rax).
 */
ENTRY(cont_longjmp)
	movq (%rdi), %rcx	/* Return address */
	movq 8(%rdi), %rsp
	movq 16(%rdi), %rbp
	movq 24(%rdi), %rbx
	movq 32(%rdi), %r12
	movq 40(%rdi), %r13
	movq 48(%rdi), %r14
	movq 56(%rdi), %r15
	movl %esi, %eax		/* Value to be returned by cont_setjmp() */
	testl %eax, %eax	/* cannot be 0 in this case */
	jnz 1f
	incl %eax		/* Return 1 instead */
1:
	jmpq *%rcx
ENDPROC(cont_longjmp)

41
darling/continuation.c Normal file
View File

@ -0,0 +1,41 @@
/*
* Darling Mach Linux Kernel Module
* Copyright (C) 2020 Lubos Dolezel
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version 2
* of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
*/
#include <duct/duct.h>
#include <duct/duct_pre_xnu.h>
#include <osfmk/kern/thread.h>
#include <duct/duct_kern_debug.h>
#include <duct/duct_post_xnu.h>
#include "continuation.h"
#include "task_registry.h"
/*
 * Invalidate a continuation jump buffer.
 *
 * Clears the saved return address; thread_syscall_return() treats a zero
 * __rip as "no XNU_CONTINUATION_ENABLED() frame is active".
 */
void cont_discard(struct cont_jmpbuf *buf)
{
	buf->__rip = 0UL;
}
/*
 * Finish the current trap from inside an XNU continuation.
 *
 * Jumps back to the cont_setjmp() call made by XNU_CONTINUATION_ENABLED()
 * on the current thread, handing it `ret`. If the thread's jump buffer is
 * not armed (__rip == 0), reports the problem through duct_panic() instead.
 *
 * NOTE(review): cont_longjmp() delivers a value of 0 as 1, and
 * XNU_CONTINUATION_ENABLED() maps 1 back to 0, so a `ret` of exactly 1 is
 * reported to the trap caller as 0.
 */
void thread_syscall_return(kern_return_t ret)
{
	thread_t self = darling_thread_get_current();
	struct cont_jmpbuf *jb = (struct cont_jmpbuf *) self->cont_jmpbuf;

	if (jb->__rip != 0) {
		cont_longjmp(jb, ret);
		return;
	}

	duct_panic("thread_syscall_return invoked, but XNU_CONTINUATION_ENABLED() was not used!");
}

60
darling/continuation.h Normal file
View File

@ -0,0 +1,60 @@
/*
* Darling Mach Linux Kernel Module
* Copyright (C) 2020 Lubos Dolezel
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version 2
* of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
*/
#ifndef _CONTINUATION_H
#define _CONTINUATION_H
// Yes, we brought setjmp/longjmp to the kernel space
#ifdef CONFIG_X86_64
/*
 * Context saved by cont_setjmp() and restored by cont_longjmp().
 *
 * The field order is the memory layout the assembly relies on: it stores
 * to and loads from offsets 0, 8, ..., 56 from the buffer pointer, one
 * 8-byte slot per field in the order below.
 */
struct cont_jmpbuf
{
unsigned long __rip; // offset 0: return address of the cont_setjmp() call; 0 means "not armed" (see cont_discard())
unsigned long __rsp; // offset 8: stack pointer after the return address was popped
unsigned long __rbp; // offset 16
unsigned long __rbx; // offset 24
unsigned long __r12; // offset 32
unsigned long __r13; // offset 40
unsigned long __r14; // offset 48
unsigned long __r15; // offset 56
};
#endif
/* Marks `buf` as unarmed by clearing its saved return address. */
void cont_discard(struct cont_jmpbuf* buf);
// In assembly
/*
 * Saves the caller's resume point into `buf` and returns 0. Returns a second
 * time, with a non-zero value, when cont_longjmp() is called on `buf`.
 *
 * The compiler only special-cases the standard setjmp-family names, so this
 * function must be declared returns_twice explicitly; otherwise the caller
 * may be optimized on the assumption that the call returns only once.
 */
int cont_setjmp(struct cont_jmpbuf* buf) __attribute__((returns_twice));
/*
 * Restores the context saved in `buf` and resumes at the matching
 * cont_setjmp() call, which then yields `v` (or 1 if `v` is 0).
 */
void cont_longjmp(struct cont_jmpbuf* buf, int v);
/*
 * Evaluate `call` with a continuation landing point armed on the current
 * thread, and yield its int result.
 *
 * - On the direct path cont_setjmp() returns 0, `call` is evaluated once and
 *   its value is the result.
 * - If thread_syscall_return(ret) runs while `call` is executing, control
 *   re-enters here through cont_longjmp() and the result is `ret`.
 *   A delivered value of 1 stands for 0, because cont_longjmp() cannot
 *   deliver 0.
 *
 * The jump buffer is disarmed with cont_discard() on both paths.
 *
 * All locals carry a __cont_ prefix so they cannot capture identifiers used
 * inside `call` (e.g. a caller variable named `v` or `rv`).
 *
 * NOTE(review): a `ret` of exactly 1 passed to thread_syscall_return() is
 * indistinguishable from 0 here and is reported as 0.
 */
#define XNU_CONTINUATION_ENABLED(call) ({ \
	thread_t __cont_self = darling_thread_get_current(); \
	int __cont_rv; \
	int __cont_v = cont_setjmp((struct cont_jmpbuf*) __cont_self->cont_jmpbuf); \
	if (__cont_v == 0) { \
		/* Direct return from cont_setjmp(): no continuation used (so far) */ \
		__cont_rv = (call); \
	} else { \
		/* Re-entered via cont_longjmp(); 1 encodes a requested value of 0 */ \
		__cont_rv = (__cont_v == 1) ? 0 : __cont_v; \
	} \
	cont_discard((struct cont_jmpbuf*) __cont_self->cont_jmpbuf); \
	__cont_rv; \
})
#endif

View File

@ -48,12 +48,16 @@ struct evpsetfd_ctx
SLIST_ENTRY(evpsetfd_ctx) kn_selnext;
wait_queue_head_t wait_queue;
struct evpset_event event;
int has_event;
};
static int evpsetfd_release(struct inode* inode, struct file* file);
static unsigned int evpsetfd_poll(struct file* file, poll_table* wait);
static ssize_t evpsetfd_read(struct file* file, char __user* buf, size_t count, loff_t* ppos);
static ssize_t evpsetfd_write(struct file* file, const char __user *buf, size_t count, loff_t* ppos);
int knote_attach_evpset(struct klist* list, struct evpsetfd_ctx* kn);
int knote_detach_evpset(struct klist* list, struct evpsetfd_ctx* kn);
static const struct file_operations evpsetfd_ops =
{
@ -84,6 +88,9 @@ int evpsetfd_create(unsigned int port_name, const struct evpset_options* opts)
ctx->port_name = port_name;
ctx->pset = pset;
memset(&ctx->event, 0, sizeof(ctx->event));
init_waitqueue_head(&ctx->wait_queue);
memcpy(&ctx->opts, opts, sizeof(*opts));
@ -97,6 +104,12 @@ int evpsetfd_create(unsigned int port_name, const struct evpset_options* opts)
return fd;
}
ipc_mqueue_t mqueue = &pset->ips_messages;
knote_attach_evpset(&mqueue->imq_klist, ctx);
ctx->has_event = ipc_mqueue_set_peek(mqueue);
debug_msg(" are there events already? %d!\n", ctx->has_event);
ips_reference(pset);
ips_unlock(pset);
@ -107,6 +120,9 @@ int evpsetfd_release(struct inode* inode, struct file* file)
{
struct evpsetfd_ctx* ctx = (struct evpsetfd_ctx*) file->private_data;
ipc_mqueue_t mqueue = &ctx->pset->ips_messages;
knote_detach_evpset(&mqueue->imq_klist, ctx);
ips_release(ctx->pset);
kfree(ctx);
@ -118,16 +134,9 @@ unsigned int evpsetfd_poll(struct file* file, poll_table* wait)
struct evpsetfd_ctx* ctx = (struct evpsetfd_ctx*) file->private_data;
ipc_mqueue_t set_mq = &ctx->pset->ips_messages;
int res = wait_event_interruptible(ctx->wait_queue, ipc_mqueue_peek(set_mq, NULL, NULL, NULL, NULL, NULL));
poll_wait(file, &ctx->wait_queue, wait);
if (res == -LINUX_ERESTARTSYS)
{
debug_msg("evpsetfd_poll(): interrupted\n");
return 0;
}
debug_msg("evpsetfd_poll(): there is a pending msg\n");
return POLLIN | POLLRDNORM;
return ctx->has_event ? (POLLIN | POLLRDNORM) : 0;
}
ssize_t evpsetfd_read(struct file* file, char __user* buf, size_t count, loff_t* ppos)
@ -137,64 +146,70 @@ ssize_t evpsetfd_read(struct file* file, char __user* buf, size_t count, loff_t*
debug_msg("evpsetfd_read() called\n");
ctx->has_event = false;
if (count != sizeof(struct evpset_event))
return -LINUX_EINVAL;
// Taken from XNU's filt_machport()
// and adapted to return struct evpset_event
mach_port_name_t name = ctx->port_name;
ipc_pset_t pset = IPS_NULL;
wait_result_t wresult;
thread_t self = current_thread();
kern_return_t kr;
mach_msg_option_t option;
mach_msg_size_t size;
ipc_mqueue_t mqueue = &ctx->pset->ips_messages;
ipc_object_t object = &ctx->pset->ips_object;
thread_t self = current_thread();
boolean_t used_filtprocess_data = FALSE;
/* never called from below */
// assert(hint == 0);
memset(&out, 0, sizeof(out));
wait_result_t wresult;
mach_msg_option_t option;
mach_vm_address_t addr;
mach_msg_size_t size;
/*
* called from user context. Have to validate the
* name. If it changed, we have an EOF situation.
*/
kr = ipc_object_translate(current_space(), name,
MACH_PORT_RIGHT_PORT_SET,
(ipc_object_t *)&pset);
if (kr != KERN_SUCCESS || pset != ctx->pset || !ips_active(pset)) {
out.port = 0;
out.flags |= (EV_EOF | EV_ONESHOT);
if (pset != IPS_NULL) {
ips_unlock(pset);
}
if (copy_to_user(buf, &out, sizeof(out)))
return -LINUX_EFAULT;
imq_lock(mqueue);
return sizeof(out);
/* Capture current state */
//*kev = kn->kn_kevent;
/* If already deallocated/moved return one last EOF event */
if (ctx->event.flags & EV_EOF) {
imq_unlock(mqueue);
//return 1;
goto out;
}
/* just use the reference from here on out */
ips_reference(pset);
ips_unlock(pset);
/*
* Only honor supported receive options. If no options are
* provided, just force a MACH_RCV_TOO_LARGE to detect the
* name of the port and sizeof the waiting message.
*/
option = ctx->opts.sfflags & (MACH_RCV_MSG|MACH_RCV_LARGE|MACH_RCV_TRAILER_MASK);
option = ctx->opts.sfflags & (MACH_RCV_MSG|MACH_RCV_LARGE|MACH_RCV_LARGE_IDENTITY|
MACH_RCV_TRAILER_MASK|MACH_RCV_VOUCHER);
if (option & MACH_RCV_MSG) {
if (ctx->opts.rcvbuf_size == 0)
option |= MACH_RCV_LARGE;
self->ith_msg_addr = (mach_vm_address_t) ctx->opts.rcvbuf;
size = (mach_msg_size_t)ctx->opts.rcvbuf_size;
addr = (mach_vm_address_t) ctx->opts.rcvbuf;
size = (mach_msg_size_t) ctx->opts.rcvbuf_size;
#if 0
/*
* If the kevent didn't specify a buffer and length, carve a buffer
* from the filter processing data according to the flags.
*/
if (size == 0 && process_data != NULL) {
used_filtprocess_data = TRUE;
addr = (mach_vm_address_t)process_data->fp_data_out;
size = (mach_msg_size_t)process_data->fp_data_resid;
option |= (MACH_RCV_LARGE | MACH_RCV_LARGE_IDENTITY);
if (process_data->fp_flags & KEVENT_FLAG_STACK_DATA)
option |= MACH_RCV_STACK;
}
#endif
} else {
/* just detect the port name (if a set) and size of the first message */
option = MACH_RCV_LARGE;
self->ith_msg_addr = 0;
addr = 0;
size = 0;
}
/* just use the reference from here on out */
io_reference(object);
/*
* Set up to receive a message or the notification of a
* too large message. But never allow this call to wait.
@ -202,35 +217,37 @@ ssize_t evpsetfd_read(struct file* file, char __user* buf, size_t count, loff_t*
* options, pass those through here. But we don't support
* scatter lists through this interface.
*/
self->ith_object = (ipc_object_t)pset;
self->ith_msize = size;
self->ith_object = object;
self->ith_msg_addr = addr;
self->ith_rsize = size;
self->ith_msize = 0;
self->ith_option = option;
// self->ith_scatter_list_size = 0;
self->ith_receiver_name = MACH_PORT_NULL;
self->ith_continuation = NULL;
option |= MACH_RCV_TIMEOUT; // never wait
assert((self->ith_state = MACH_RCV_IN_PROGRESS) == MACH_RCV_IN_PROGRESS);
self->ith_state = MACH_RCV_IN_PROGRESS;
wresult = ipc_mqueue_receive_on_thread(
&pset->ips_messages,
mqueue,
option,
size, /* max_size */
0, /* immediate timeout */
THREAD_INTERRUPTIBLE,
self);
assert(wresult == THREAD_NOT_WAITING);
assert(self->ith_state != MACH_RCV_IN_PROGRESS);
/* mqueue unlocked */
debug_msg("- ith_state after receive_on_thread: %d\n", self->ith_state);
/*
* If we timed out, just release the reference on the
* portset and return zero.
* If we timed out, or the process is exiting, just release the
* reference on the ipc_object and return zero.
*/
if (self->ith_state == MACH_RCV_TIMED_OUT) {
ips_release(pset);
if (wresult == THREAD_RESTART || self->ith_state == MACH_RCV_TIMED_OUT) {
io_release(object);
return -LINUX_EAGAIN;
}
assert(wresult == THREAD_NOT_WAITING);
assert(self->ith_state != MACH_RCV_IN_PROGRESS);
/*
* If we weren't attempting to receive a message
* directly, we need to return the port name in
@ -239,28 +256,72 @@ ssize_t evpsetfd_read(struct file* file, char __user* buf, size_t count, loff_t*
if ((option & MACH_RCV_MSG) != MACH_RCV_MSG) {
assert(self->ith_state == MACH_RCV_TOO_LARGE);
assert(self->ith_kmsg == IKM_NULL);
// kn->kn_data = self->ith_receiver_name;
out.port = self->ith_receiver_name;
ips_release(pset);
if (copy_to_user(buf, &out, sizeof(out)))
return -LINUX_EFAULT;
return sizeof(out);
ctx->event.port = self->ith_receiver_name;
io_release(object);
return -LINUX_EAGAIN;
}
/*
* Attempt to receive the message directly, returning
* the results in the fflags field.
*/
assert(option & MACH_RCV_MSG);
out.port = self->ith_receiver_name;
out.msg_size = self->ith_msize;
out.receive_status = mach_msg_receive_results(NULL);
/* kmsg and pset reference consumed */
ctx->event.receive_status = mach_msg_receive_results(&size);
if (copy_to_user(buf, &out, sizeof(out)))
/* kmsg and object reference consumed */
/*
* if the user asked for the identity of ports containing
* a too-large message, return it in the data field (as we
* do for messages we didn't try to receive).
*/
if (ctx->event.receive_status == MACH_RCV_TOO_LARGE) {
ctx->event.msg_size = self->ith_msize;
if (option & MACH_RCV_LARGE_IDENTITY)
ctx->event.port = self->ith_receiver_name;
else
ctx->event.port = MACH_PORT_NULL;
} else {
ctx->event.msg_size = size;
ctx->event.port = MACH_PORT_NULL;
}
#if 0
/*
* If we used a data buffer carved out from the filt_process data,
* store the address used in the knote and adjust the residual and
* other parameters for future use.
*/
if (used_filtprocess_data) {
assert(process_data->fp_data_resid >= size);
process_data->fp_data_resid -= size;
if ((process_data->fp_flags & KEVENT_FLAG_STACK_DATA) == 0) {
kev->ext[0] = process_data->fp_data_out;
process_data->fp_data_out += size;
} else {
assert(option & MACH_RCV_STACK);
kev->ext[0] = process_data->fp_data_out +
process_data->fp_data_resid;
}
}
/*
* Apply message-based QoS values to output kevent as prescribed.
* The kev->qos field gets max(msg-qos, kn->kn_qos).
* The kev->ext[2] field gets (msg-qos << 32) | (override-qos).
*
* The mach_msg_receive_results() call saved off the message
* QoS values in the continuation save area on successful receive.
*/
if (ctx->event.receive_status == MACH_MSG_SUCCESS) {
kev->qos = mach_msg_priority_combine(self->ith_qos, kn->kn_qos);
kev->ext[2] = ((uint64_t)self->ith_qos << 32) |
(uint64_t)self->ith_qos_override;
}
#endif
out:
if (copy_to_user(buf, &ctx->event, sizeof(ctx->event)))
return -LINUX_EFAULT;
memset(&ctx->event, 0, sizeof(ctx->event));
return sizeof(out);
}
@ -283,12 +344,16 @@ ssize_t evpsetfd_write(struct file* file, const char __user *buf, size_t count,
/*
 * Notify every evpsetfd context attached to `list` that something happened.
 *
 * For each attached context:
 *  - if `hint` is NOTE_REVOKE, the pending event is flagged EV_EOF | EV_ONESHOT;
 *  - has_event is set, which evpsetfd_poll() reports as readable;
 *  - the context's Linux wait queue is woken.
 *
 * Debug output uses %p for pointers and %lx for the long `hint`; the previous
 * "0x%x" specifiers did not match the argument types.
 */
void knote(struct klist* list, long hint)
{
	struct evpsetfd_ctx* kn;

	debug_msg("knote() on list %p called with hint=0x%lx\n", list, hint);

	SLIST_FOREACH(kn, list, kn_selnext)
	{
		// TODO: hint?
		if (hint == NOTE_REVOKE)
		{
			kn->event.flags = EV_EOF | EV_ONESHOT;
		}

		kn->has_event = true;

		debug_msg("knote() is waking up a Linux wait queue %p\n", &kn->wait_queue);
		wake_up_interruptible(&kn->wait_queue);
	}
}
@ -301,12 +366,14 @@ void klist_init(struct klist* list)
/*
 * Attach an evpsetfd context to the head of a klist.
 *
 * Returns non-zero if the list was empty before this insertion, 0 otherwise.
 * The list pointer is logged with %p; "0x%x" does not match a pointer argument.
 */
int knote_attach_evpset(struct klist* list, struct evpsetfd_ctx* kn)
{
	int ret = SLIST_EMPTY(list);

	debug_msg("Attaching to klist %p\n", list);
	SLIST_INSERT_HEAD(list, kn, kn_selnext);
	return ret;
}
/*
 * Remove an evpsetfd context from a klist.
 *
 * Returns non-zero if the list is empty after the removal, 0 otherwise.
 * The list pointer is logged with %p; "0x%x" does not match a pointer argument.
 */
int knote_detach_evpset(struct klist* list, struct evpsetfd_ctx* kn)
{
	debug_msg("Detaching from klist %p\n", list);
	SLIST_REMOVE(list, kn, evpsetfd_ctx, kn_selnext);
	return SLIST_EMPTY(list);
}

View File

@ -1,6 +1,6 @@
/*
* Darling Mach Linux Kernel Module
* Copyright (C) 2015-2018 Lubos Dolezel
* Copyright (C) 2015-2020 Lubos Dolezel
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
@ -56,6 +56,7 @@
#include "binfmt.h"
#include "commpage.h"
#include "foreign_mm.h"
#include "continuation.h"
#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,11,0)
#define current linux_current
@ -725,7 +726,7 @@ int mach_msg_overwrite_entry(task_t task, struct mach_msg_overwrite_args* in_arg
// out.notify = args.notify;
out.rcv_msg = (user_addr_t) args.rcv_msg;
return mach_msg_overwrite_trap(&out);
return XNU_CONTINUATION_ENABLED(mach_msg_overwrite_trap(&out));
}
int _kernelrpc_mach_port_deallocate_entry(task_t task, struct mach_port_deallocate_args* in_args)
@ -736,7 +737,7 @@ int _kernelrpc_mach_port_deallocate_entry(task_t task, struct mach_port_dealloca
out.target = args.task_right_name;
out.name = args.port_right_name;
return _kernelrpc_mach_port_deallocate_trap(&out);
return XNU_CONTINUATION_ENABLED(_kernelrpc_mach_port_deallocate_trap(&out));
}
int _kernelrpc_mach_port_destroy(task_t task, struct mach_port_destroy_args* in_args)
@ -747,7 +748,7 @@ int _kernelrpc_mach_port_destroy(task_t task, struct mach_port_destroy_args* in_
out.target = args.task_right_name;
out.name = args.port_right_name;
return _kernelrpc_mach_port_destroy_trap(&out);
return XNU_CONTINUATION_ENABLED(_kernelrpc_mach_port_destroy_trap(&out));
}
int _kernelrpc_mach_port_insert_right_entry(task_t task, struct mach_port_insert_right_args* in_args)

View File

@ -152,11 +152,13 @@ thread_poll_yield(thread_t self)
// osfmk/i386/trap.c
#if 0
// Compiled out: this was only a "not implemented" placeholder.
// A working thread_syscall_return() is defined in darling/continuation.c.
void
thread_syscall_return(kern_return_t ret)
{
kprintf("not implemented: thread_syscall_return()\n");
}
#endif
// osfmk/kern/startup.c

View File

@ -57,6 +57,7 @@ void duct_panic (const char * str, ...)
// well, we should never use linux_panic
printk (KERN_NOTICE "PANIC: %s", str);
__WARN();
}
void Assert (const char * file, int line, const char * expression)
@ -70,6 +71,7 @@ void Assert (const char * file, int line, const char * expression)
// saved_return_on_panic = return_on_panic;
// return_on_panic = 1;
printk (KERN_NOTICE "FAILED ASSERTION[%s:%d]: %s", file, line, expression);
__WARN();
// return_on_panic = saved_return_on_panic;
}

View File

@ -298,6 +298,8 @@ static kern_return_t duct_thread_create_internal (task_t parent_task, integer_t
new_thread->task = parent_task;
new_thread->ref_count = 2;
new_thread->waitq = NULL;
new_thread->thread_magic = THREAD_MAGIC;
thread_lock_init(new_thread);
// wake_lock_init(new_thread);
@ -644,12 +646,6 @@ wait_result_t thread_mark_wait_locked(thread_t thread, wait_interrupt_t interrup
wait_result_t thread_block(thread_continue_t cont)
{
if (cont != THREAD_CONTINUE_NULL)
{
panic("thread_block: continuations are not supported!");
return 0;
}
thread_t thread = current_thread();
thread->wait_result = THREAD_AWAKENED;
@ -662,6 +658,12 @@ wait_result_t thread_block(thread_continue_t cont)
thread->wait_result = THREAD_INTERRUPTED;
set_current_state(TASK_RUNNING);
if (cont != THREAD_CONTINUE_NULL)
{
cont(NULL, thread->wait_result);
panic("thread_block: continuation isn't supposed to return!");
}
return thread->wait_result;
}

View File

@ -835,15 +835,6 @@ ipc_mqueue_post(
break;
}
#if defined (__DARLING__)
// queue message and break anyways as we don't know how to handle this case yet
printk (KERN_NOTICE "- BUG: don't know how to handle\n");
#endif
#if 0
ipc_kmsg_enqueue_macro (&mqueue->imq_messages, kmsg);
thread_unlock (receiver);
break;
#endif
/*
* Otherwise, this thread needs to be released to run
* and handle its error without getting the message. We

View File

@ -525,6 +525,9 @@ struct thread {
#if defined (__DARLING__)
struct task_struct * linux_task;
// linux_wait_queue_t lwait;
#ifdef __x86_64__
unsigned long cont_jmpbuf[8];
#endif
struct ksyn_waitq_element uu_kwe;
uint64_t triggered_watchpoint_address;
unsigned int triggered_watchpoint_operation;
@ -572,7 +575,7 @@ struct thread {
#define sth_result saved.sema.result
#define sth_continuation saved.sema.continuation
#if MACH_ASSERT
#if MACH_ASSERT && !defined(__DARLING__)
#define assert_thread_magic(thread) assertf((thread)->thread_magic == THREAD_MAGIC, \
"bad thread magic 0x%llx for thread %p, expected 0x%llx", \
(thread)->thread_magic, (thread), THREAD_MAGIC)