// darling-newlkm/darling/kqueue.c
#include <duct/duct.h>
#include <duct/compiler/clang/asm-inline.h>
#include <linux/slab.h>
#include <linux/anon_inodes.h>
#include <linux/fs.h>
#define current linux_current
#include <linux/fdtable.h>
#undef current
#include <linux/poll.h>
#if 0 // TODO: EVFILT_SOCK support (we've gotta fix some header collisions)
#include <linux/net.h>
#include <net/sock.h>
#endif
#include <linux/fsnotify_backend.h>
#include <linux/workqueue.h>
#if 0
// unconditionally print messages; useful for debugging kqueue by itself (enabling debug output for the entire LKM produces *lots* of output)
#define dkqueue_log(x, ...) printk("\0015<%d> dkqueue: " x "\n", linux_current->pid, ##__VA_ARGS__)
#else
#include "debug_print.h"
#define dkqueue_log(x, ...) debug_msg("dkqueue: " x "\n", ##__VA_ARGS__)
#endif
// we include it directly so we can use some of its static functions without modifying it unnecessarily
#include "../bsd/kern/kern_event.c"
#include <duct/duct_pre_xnu.h>
#include <vm/vm_map.h>
#include <sys/queue.h>
#include <duct/duct_post_xnu.h>
#include "kqueue.h"
#include "task_registry.h"
#if LINUX_VERSION_CODE < KERNEL_VERSION(5,11,0)
# define darling_fcheck_files fcheck_files
# define darling_close_fd ksys_close
#else
# define darling_fcheck_files files_lookup_fd_rcu
# define darling_close_fd close_fd
#endif
// re-define `fcheck` because we use `linux_current`
#define fcheck(fd) darling_fcheck_files(linux_current->files, fd)
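// e.g. `darling_kqueue_create` below uses it under RCU protection:
//     rcu_read_lock(); kq->kqf_kqueue.dkq_fp = fcheck(fd); rcu_read_unlock();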
struct dkqueue_pte;
typedef SLIST_HEAD(dkqueue_pte_head, dkqueue_pte) dkqueue_pte_head_t;
typedef SLIST_ENTRY(dkqueue_pte) dkqueue_pte_entry_t;
// pte = poll table entry
typedef struct dkqueue_pte {
wait_queue_entry_t wqe;
wait_queue_head_t* wqh;
dkqueue_pte_entry_t link;
} dkqueue_pte_t;
// ptc = poll table container
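// note: `pt` must remain the first member; `dkqueue_poll_wait_callback` receives a
// `poll_table*` from the driver and casts it back to the containing `dkqueue_ptc_t`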
typedef struct dkqueue_ptc {
poll_table pt;
struct knote* kn;
dkqueue_pte_head_t pte_head;
struct task_struct* polling_task;
struct work_struct activator;
bool triggered;
// socket state tracking
bool connected: 1;
bool send_shutdown: 1;
bool receive_shutdown: 1;
} dkqueue_ptc_t;
typedef struct dkqueue_proc_context {
struct knote* kn;
int xnu_pid;
int64_t registration_id;
int32_t saved_data;
bool destroy_it;
} dkqueue_proc_context_t;
typedef struct dkqueue_vnode_context {
struct fsnotify_mark fs_mark;
struct knote* kn;
loff_t cached_size; // used to determine if a file was extended or not (for NOTE_EXTEND)
unsigned int cached_nlink; // used to determine if a file's link count changed (for NOTE_LINK)
} dkqueue_vnode_context_t;
static struct kmem_cache* container_cache = NULL;
static struct kmem_cache* entry_cache = NULL;
static struct kmem_cache* proc_context_cache = NULL;
static struct kmem_cache* dkqueue_list_entry_cache = NULL;
static struct kmem_cache* vnode_context_cache = NULL;
static struct workqueue_struct* file_activation_wq = NULL;
// TODO: fs filter
const struct filterops fs_filtops = {
.f_attach = filt_no_attach,
.f_detach = filt_no_detach,
.f_event = filt_bad_event,
.f_touch = filt_bad_touch,
.f_process = filt_bad_process,
};
// our own functions are prefixed with `dkq` or `dkqueue` (Darling kqueue) to avoid possible conflicts
// with current or future XNU kqueue functions (because we try to defer to XNU's code as much as possible)
static unsigned int dkqueue_poll(struct file* file, poll_table* wait);
static int dkqueue_release(struct inode* inode, struct file* file);
static int dkqueue_file_attach(struct knote* kn, struct kevent_qos_s* kev);
static void dkqueue_file_detach(struct knote* kn);
static int dkqueue_file_event(struct knote* kn, long hint);
static int dkqueue_file_touch(struct knote* kn, struct kevent_qos_s* kev);
static int dkqueue_file_process(struct knote* kn, struct kevent_qos_s* kev);
static int dkqueue_file_peek(struct knote* kn);
static void dkqueue_file_init(void);
static int dkqueue_file_peek_with_poll_result(struct knote* kn, __poll_t poll_result);
static void dkqueue_poll_unsubscribe_all(dkqueue_ptc_t* ptc);
static int dkqueue_perform_default_wakeup(wait_queue_entry_t* wqe, unsigned int mode, int sync, void* context);
static int dkqueue_poll_wakeup_callback(wait_queue_entry_t* wqe, unsigned int mode, int sync, void* context);
static void dkqueue_poll_wait_callback(struct file* file, wait_queue_head_t* wqh, poll_table* pt);
static int dkqueue_proc_attach(struct knote* kn, struct kevent_qos_s* kev);
static void dkqueue_proc_detach(struct knote* kn);
static int dkqueue_proc_event(struct knote* kn, long hint);
static int dkqueue_proc_touch(struct knote* kn, struct kevent_qos_s* kev);
static int dkqueue_proc_process(struct knote* kn, struct kevent_qos_s* kev);
static void dkqueue_proc_init(void);
static void dkqueue_proc_listener(int pid, void* context, darling_proc_event_t event, unsigned long extra);
static void dkqueue_fork_listener(int pid, void* context, darling_proc_event_t event, unsigned long extra);
static int dkqueue_vnode_attach(struct knote* kn, struct kevent_qos_s* kev);
static void dkqueue_vnode_detach(struct knote* kn);
static int dkqueue_vnode_event(struct knote* kn, long hint);
static int dkqueue_vnode_touch(struct knote* kn, struct kevent_qos_s* kev);
static int dkqueue_vnode_process(struct knote* kn, struct kevent_qos_s* kev);
static int dkqueue_vnode_peek(struct knote* kn);
static void dkqueue_vnode_init(void);
#if LINUX_VERSION_CODE >= KERNEL_VERSION(5,9,0)
static int dkqueue_vnode_fs_handle_event(struct fsnotify_group* group, u32 mask, const void* data, int data_type, struct inode* dir, const struct qstr* file_name, u32 cookie, struct fsnotify_iter_info* iter_info);
#else
static int dkqueue_vnode_fs_handle_event(struct fsnotify_group* group, struct inode* inode, u32 mask, const void* data, int data_type, const struct qstr* file_name, u32 cookie, struct fsnotify_iter_info* iter_info);
#endif
static void dkqueue_vnode_free_group(struct fsnotify_group *group);
#if LINUX_VERSION_CODE < KERNEL_VERSION(5,16,0)
static void dkqueue_vnode_fs_free_event(struct fsnotify_event* event);
#else
static void dkqueue_vnode_fs_free_event(struct fsnotify_group*, struct fsnotify_event* event);
#endif
static void dkqueue_vnode_fs_free_mark(struct fsnotify_mark* mark);
static void dkqueue_vnode_update_wanted_events(dkqueue_vnode_context_t* ctx);
static void vnode_context_lock(dkqueue_vnode_context_t* ctx);
static void vnode_context_unlock(dkqueue_vnode_context_t* ctx);
static const struct file_operations dkqueue_ops = {
//.read = dkqueue_read,
//.write = dkqueue_write,
.poll = dkqueue_poll,
.release = dkqueue_release,
};
const struct fsnotify_ops dkqueue_vnode_fsops = {
.handle_event = dkqueue_vnode_fs_handle_event,
.free_group_priv = dkqueue_vnode_free_group,
.free_event = dkqueue_vnode_fs_free_event,
.free_mark = dkqueue_vnode_fs_free_mark,
};
//static ssize_t dkqueue_read(struct file* file, char __user* buf, size_t count, loff_t* ppos) {};
//static ssize_t dkqueue_write(struct file* file, const char __user *buf, size_t count, loff_t* ppos) {};
static unsigned int dkqueue_poll(struct file* file, poll_table* wait) {
struct kqfile* kqf = (struct kqfile*)file->private_data;
int result;
dkqueue_log("Linux wants to poll kqueue %p on file %p", kqf, file);
// tell Linux to wait for the Linux wait_queue attached to our kqueue.
// it'll be woken up in `osfmk/kern/waitq.c` when the XNU waitq receives NO_EVENT64
// (which is what `KQ_EVENT` is defined to)
poll_wait(file, &kqf->kqf_wqs.wqset_q.linux_wq, wait);
if (!poll_does_not_wait(wait)) {
kqlock(kqf);
// remember to set this; otherwise no one will wake us up when a knote gets activated!
// we only set it when we begin to wait, though, to mimic XNU's behavior
kqf->kqf_state |= KQ_SEL;
kqunlock(kqf);
}
// XNU `select` normally provides a value for wq_link_id so that the fileop links its waitq onto the thread's waitq_set
// (much like Linux `file_operations::poll`), however we *don't* want kqueue to do that because:
// 1. we grab and wait for the kqueue's waitq directly in the above `poll_wait` call
// 2. we don't even use the uthread's waitq_set, so it's useless for kqueue to attach its waitq onto it
result = kqueue_select(kqf, FREAD, NULL);
dkqueue_log("is kqueue already active? %s", result ? "yes" : "no");
return (result > 0) ? (POLLIN | POLLRDNORM) : 0;
};
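// hedged illustration (not compiled into the LKM): the userspace pattern the poll fileop
// above serves. a Darwin program under Darling can select(2) on a kqueue fd, which XNU
// services via `kqueue_select` and which lands here in `dkqueue_poll`; everything below
// is plain POSIX:
#if 0
#include <sys/select.h>
static int kqueue_fd_is_ready(int kq) {
	fd_set readable;
	FD_ZERO(&readable);
	FD_SET(kq, &readable);
	// blocks until at least one knote on the kqueue is active,
	// i.e. until a kevent() call on it would not block
	return select(kq + 1, &readable, NULL, NULL, NULL) > 0;
}
#endif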
static int dkqueue_release(struct inode* inode, struct file* file) {
struct kqfile* kqf = (struct kqfile*)file->private_data;
proc_t proc;
dkqueue_list_entry_t* curr;
dkqueue_list_entry_t* temp;
if (!kqf) {
// `dkqueue_clear` already cleaned us up
return 0;
}
proc = kqf->kqf_p;
dkqueue_log("Linux file pointer %p for kqueue %p released", file, kqf);
proc_fdlock(proc);
LIST_FOREACH_SAFE(curr, &proc->p_fd->kqueue_list, link, temp) {
if (curr->kq == kqf) {
dkqueue_log("releasing kqueue %p", kqf);
LIST_REMOVE(curr, link);
if (kqf->kqf_kqueue.dkq_fs_group) {
dkqueue_log("destroying fsnotify group for kqueue");
fsnotify_put_group(kqf->kqf_kqueue.dkq_fs_group);
}
// XNU always calls `drain` then `close`; let's do likewise
kqueue_drain(kqf);
// `kqueue_close` takes `proc_fdlock`, so make sure it's unlocked
proc_fdunlock(proc);
kqueue_close(kqf);
proc_fdlock(proc);
kmem_cache_free(dkqueue_list_entry_cache, curr);
break;
}
}
proc_fdunlock(proc);
return 0;
};
// i don't know why `fget_task` isn't exported in Linux!
// (we shouldn't use `fget`, because that's only for the current process, and we might not be dealing with the current process)
// prevent XNU macros from overriding Linux inline functions
#undef task_lock
#undef task_unlock
// <copied from="linux://5.8.14/fs/file.c" modified="true">
static struct file *__fget_files(struct files_struct *files, unsigned int fd,
fmode_t mask, unsigned int refs)
{
struct file *file;
rcu_read_lock();
loop:
file = darling_fcheck_files(files, fd);
if (file) {
/* File object ref couldn't be taken.
* dup2() atomicity guarantee is the reason
* we loop to catch the new file (or NULL pointer)
*/
if (file->f_mode & mask)
file = NULL;
else if (!get_file_rcu_many(file, refs))
goto loop;
}
rcu_read_unlock();
return file;
}
#ifdef __DARLING__
static struct file *fget_task_noconflict(struct task_struct *task, unsigned int fd)
#else
struct file *fget_task(struct task_struct *task, unsigned int fd)
#endif
{
struct file *file = NULL;
task_lock(task);
if (task->files)
file = __fget_files(task->files, fd, 0, 1);
task_unlock(task);
return file;
}
// </copied>
int darling_fd_lookup(proc_t proc, int fd, struct file** out_file) {
struct file* tmp = fget_task_noconflict(((task_t)proc->task)->map->linux_task, fd);
if (!tmp) {
return EBADF;
}
if (out_file) {
*out_file = tmp;
}
return 0;
};
/**
* @brief Check if `events` are enabled for the given knote
* @param kn The knote to check
* @param events The events to check for
* @param all_or_nothing If `true`, this function will only return `true` if *all* the events are present
*/
static bool knote_wants_events(struct knote* kn, int events, bool all_or_nothing) {
return all_or_nothing ? ((kn->kn_sfflags & events) == events) : ((kn->kn_sfflags & events) != 0);
};
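// e.g. `knote_wants_events(kn, NOTE_WRITE | NOTE_DELETE, false)` is true if the user
// subscribed to either note; with `all_or_nothing`, both would be required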
//
// file descriptor filter
//
// alright, so, the way we do this is kind of a hack.
// (but not really; it looks like Linux's APIs took into account the possibility of someone wanting to do this)
//
// we essentially mimic what Linux `poll` and `select` do on files.
// we call `file_operation`'s `poll` for the file and hand it our own `poll_table`.
// it's not a regular `poll_table`, though; it's extended to contain a pointer to the knote
// so that our `poll_queue_proc` replacement can connect the Linux waitqueues it gets with the XNU knote
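// hedged illustration (not compiled into the LKM): the userspace registration this filter
// backs, using the standard BSD <sys/event.h> API. adding an EVFILT_READ knote for a
// descriptor is what eventually invokes `dkqueue_file_attach` below:
#if 0
#include <sys/event.h>
static int watch_fd_for_reads(int kq, int fd) {
	struct kevent change;
	EV_SET(&change, fd, EVFILT_READ, EV_ADD | EV_ENABLE, 0, 0, NULL);
	return kevent(kq, &change, 1, NULL, 0, NULL);
}
#endif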
static void dkqueue_file_init(void) {
container_cache = kmem_cache_create("dkqueue_ptc_t", sizeof(dkqueue_ptc_t), 0, 0, NULL);
entry_cache = kmem_cache_create("dkqueue_pte_t", sizeof(dkqueue_pte_t), 0, 0, NULL);
file_activation_wq = create_singlethread_workqueue("dkqueue_activator");
};
static int dkqueue_file_peek_with_poll_result(struct knote* kn, __poll_t poll_result) {
int result = 0;
dkqueue_ptc_t* ptc = kn->kn_hook;
if ((poll_result & POLLHUP) != 0 || (poll_result & POLLRDHUP) != 0 || (poll_result & POLLERR) != 0) {
// possible bug: are we supposed to set EV_EOF for EVFILT_SOCK?
result = 1;
kn->kn_flags |= EV_EOF;
}
switch (kn->kn_filter) {
case EVFILT_READ: {
if ((poll_result & (POLLIN | POLLRDNORM)) != 0) {
dkqueue_log("read filter can be activated");
result = 1;
}
if ((poll_result & (POLLPRI | POLLRDBAND)) != 0) {
kn->kn_flags |= EV_OOBAND;
}
} break;
case EVFILT_WRITE: {
if ((poll_result & (POLLOUT | POLLWRNORM)) != 0) {
dkqueue_log("write filter can be activated");
result = 1;
}
} break;
case EVFILT_EXCEPT: {
if ((poll_result & POLLPRI) != 0) {
dkqueue_log("exception filter can be activated");
result = 1;
}
} break;
#if 0
// TODO: we have the most interesting socket notifications implemented,
// but it'd be great to have all of them implemented
case EVFILT_SOCK: {
// we don't need to check for failure; we already checked this is a socket in `dkqueue_file_attach`
struct socket* sock = sock_from_file(kn->kn_fp, &result);
int updated = 0;
if (!ptc->connected && (ptc->connected = (sock->state & SS_CONNECTED) != 0)) {
// we weren't connected, but we are now
updated |= NOTE_CONNECTED;
} else if (ptc->connected && !(ptc->connected = (sock->state & SS_CONNECTED) != 0)) {
// we were connected, but we aren't now
updated |= NOTE_DISCONNECTED;
}
if ((updated & (NOTE_CONNECTED | NOTE_DISCONNECTED)) != 0) {
// if the state changed to either connected or disconnected, the connection info was updated
updated |= NOTE_CONNINFO_UPDATED;
}
// try to get some more specific information
if (sock->sk) {
if (!ptc->send_shutdown && (ptc->send_shutdown = (sock->sk->sk_shutdown & SEND_SHUTDOWN) != 0)) {
// we can no longer send data on this socket
updated |= NOTE_WRITECLOSED;
}
if (!ptc->receive_shutdown && (ptc->receive_shutdown = (sock->sk->sk_shutdown & RCV_SHUTDOWN) != 0)) {
// we can no longer receive data on this socket
updated |= NOTE_READCLOSED;
}
}
// only deliver events the user asked for
updated &= kn->kn_sfflags & EVFILT_SOCK_ALL_MASK;
if (updated == 0) {
// if the user didn't want any of our events, don't report ourselves as active
result = 0;
} else {
// otherwise, the user *did* want some of our events: set them and report ourselves as active
kn->kn_fflags |= updated;
result = 1;
}
} break;
#endif
};
return result;
};
static void dkqueue_poll_unsubscribe_all(dkqueue_ptc_t* ptc) {
dkqueue_log("unsubscribing from wait queues on ptc %p", ptc);
while (!SLIST_EMPTY(&ptc->pte_head)) {
dkqueue_pte_t* pte = SLIST_FIRST(&ptc->pte_head);
dkqueue_log("unsubscribing from wait queue %p", pte->wqh);
// remove our wait queue subscription
remove_wait_queue(pte->wqh, &pte->wqe);
// remove it from the list
SLIST_REMOVE_HEAD(&ptc->pte_head, link);
// finally, deallocate it
kmem_cache_free(entry_cache, pte);
}
};
static int dkqueue_perform_default_wakeup(wait_queue_entry_t* wqe, unsigned int mode, int sync, void* context) {
dkqueue_ptc_t* ptc = wqe->private;
struct kqueue* kq = knote_get_kq(ptc->kn);
DECLARE_WAITQUEUE(dummy_wait, ptc->polling_task);
// see `__pollwake` in Linux's `fs/select.c`
smp_wmb();
ptc->triggered = true;
return default_wake_function(&dummy_wait, mode, sync, context);
};
static int dkqueue_poll_wakeup_callback(wait_queue_entry_t* wqe, unsigned int mode, int sync, void* context) {
dkqueue_pte_t* pte = (dkqueue_pte_t*)wqe;
dkqueue_ptc_t* ptc = wqe->private;
__poll_t poll_result;
dkqueue_log("wait queue for file %p woke up", ptc->kn->kn_fp);
// imitate `poll` here:
// we already subscribed to all the wait queues the driver told us to subscribe to when we called it in `dkqueue_file_attach`,
// so now we disable the `poll_wait` callback and just keep the queues we have now
if (ptc->pt._qproc != NULL) {
ptc->pt._qproc = NULL;
} else {
// again, see `__pollwake` in Linux's `fs/select.c`
smp_store_mb(ptc->triggered, false);
}
// check again to make sure there's really an event for us
poll_result = vfs_poll(ptc->kn->kn_fp, &ptc->pt);
if (dkqueue_file_peek_with_poll_result(ptc->kn, poll_result)) {
dkqueue_log("activating knote %p for file-based filter with file %p", ptc->kn, ptc->kn->kn_fp);
// queue an activator for the knote
queue_work(file_activation_wq, &ptc->activator);
}
return dkqueue_perform_default_wakeup(wqe, mode, sync, context);
};
// follows along the same lines of what `__pollwait` in Linux (in `fs/select.c`) does
static void dkqueue_poll_wait_callback(struct file* file, wait_queue_head_t* wqh, poll_table* pt) {
dkqueue_ptc_t* ptc = (dkqueue_ptc_t*)pt;
dkqueue_log("subscribing to wait queue %p for file %p", wqh, file);
// allocate a pte
dkqueue_pte_t* pte = kmem_cache_alloc(entry_cache, GFP_KERNEL);
memset(pte, 0, sizeof(dkqueue_pte_t));
SLIST_INSERT_HEAD(&ptc->pte_head, pte, link);
// initialize the pte
init_waitqueue_func_entry(&pte->wqe, dkqueue_poll_wakeup_callback);
pte->wqh = wqh;
pte->wqe.private = ptc;
// subscribe to the wait queue we were given
add_wait_queue(wqh, &pte->wqe);
};
static void dkqueue_file_activate(struct work_struct* work) {
dkqueue_ptc_t* ptc = container_of(work, dkqueue_ptc_t, activator);
struct kqueue* kq = knote_get_kq(ptc->kn);
// tell kqueue we're awake
kqlock(kq);
knote_activate(kq, ptc->kn, FILTER_ACTIVE);
kqunlock(kq);
};
static int dkqueue_file_attach(struct knote* kn, struct kevent_qos_s* kev) {
int result;
struct file* file = kn->kn_fp;
struct kqueue* kq;
dkqueue_ptc_t* ptc;
__poll_t poll_result;
#if 0
struct socket* sock = NULL;
#endif
bool connected = false;
bool send_shutdown = false;
bool receive_shutdown = false;
dkqueue_log("attaching a file-based filter onto file %p", file);
kq = knote_get_kq(kn);
#if 0
// if we're asked to report socket events, we need to know its initial status
// (polling a socket doesn't tell us what its previous status was)
// we check up here as soon as we can in case the state changes after the call to `vfs_poll` (or even when allocating the ptc)
if (kn->kn_filter == EVFILT_SOCK) {
sock = sock_from_file(file, &result);
if (!sock) {
// don't even bother checking `result`; `sock_from_file` only ever returns EBADF
knote_set_error(kn, EBADF);
return 0;
}
connected = (sock->state & SS_CONNECTED) != 0;
if (sock->sk) {
send_shutdown = (sock->sk->sk_shutdown & SEND_SHUTDOWN) != 0;
receive_shutdown = (sock->sk->sk_shutdown & RCV_SHUTDOWN) != 0;
}
}
#else
// we need to resolve some header conflicts in order to include Linux's socket headers
if (kn->kn_filter == EVFILT_SOCK) {
knote_set_error(kn, ENOTSUP);
return 0;
}
#endif
// allocate a ptc
ptc = kmem_cache_alloc(container_cache, GFP_KERNEL);
memset(ptc, 0, sizeof(dkqueue_ptc_t));
kn->kn_hook = ptc;
// initialize the ptc
ptc->polling_task = linux_current;
init_poll_funcptr(&ptc->pt, dkqueue_poll_wait_callback);
ptc->kn = kn;
SLIST_INIT(&ptc->pte_head);
ptc->connected = connected;
ptc->send_shutdown = send_shutdown;
ptc->receive_shutdown = receive_shutdown;
INIT_WORK(&ptc->activator, dkqueue_file_activate);
if (!file_can_poll(file)) {
dkqueue_log("file is always active");
// if the file doesn't have a `poll` operation, that means it's always ready.
// how nice of kqueue to have something for just this occasion :)
// kqueue is still going to try `f_peek`ing us, though (but we'll just always tell it we have data for it)
knote_markstayactive(kn);
return 1;
} else {
// otherwise, the file *can* be polled, so do so
poll_result = vfs_poll(file, &ptc->pt);
}
// check if the filter is already active
result = dkqueue_file_peek_with_poll_result(kn, poll_result);
dkqueue_log("file already active? %s", result ? "yes" : "no");
return result;
};
static void dkqueue_file_detach(struct knote* kn) {
dkqueue_ptc_t* ptc = kn->kn_hook;
dkqueue_log("detaching from file %p", kn->kn_fp);
// we might get called if `dkqueue_file_attach` fails, so we might not have a ptc yet
if (!ptc) {
return;
}
// unsubscribe from all wait queues and destroy the `pte`s
dkqueue_poll_unsubscribe_all(ptc);
// make sure there aren't any more activators running
cancel_work_sync(&ptc->activator);
// deallocate the ptc
kmem_cache_free(container_cache, ptc);
};
static int dkqueue_file_event(struct knote* kn, long hint) {
// TODO
return 0;
};
static int dkqueue_file_touch(struct knote* kn, struct kevent_qos_s* kev) {
kn->kn_sfflags = kev->fflags;
return 0;
};
static int dkqueue_file_process(struct knote* kn, struct kevent_qos_s* kev) {
__poll_t poll_result;
dkqueue_ptc_t* ptc = kn->kn_hook;
struct file* file = kn->kn_fp;
int64_t data = 0;
// make sure we still have an event one last time
if (file_can_poll(file)) {
dkqueue_log("making sure file-based filter for %p still has events...", file);
// see `dkqueue_poll_wakeup_callback`
if (ptc->pt._qproc != NULL) {
ptc->pt._qproc = NULL;
} else {
smp_store_mb(ptc->triggered, false);
}
poll_result = vfs_poll(file, &ptc->pt);
// no events
if (!dkqueue_file_peek_with_poll_result(kn, poll_result)) {
dkqueue_log("file-based filter had events, but not anymore");
return 0;
}
}
// if we made it here, we do have events available
switch (kn->kn_filter) {
case EVFILT_READ: {
if (vfs_ioctl(file, FIONREAD, (unsigned long)&data) < 0) {
data = 0;
dkqueue_log("failed to determine available bytes for EVFILT_READ");
}
dkqueue_log("found %ld bytes to read", data);
} break;
case EVFILT_WRITE: {
if (vfs_ioctl(file, TIOCOUTQ, (unsigned long)&data) < 0) {
data = 0;
dkqueue_log("failed to determine available bytes for EVFILT_WRITE");
}
dkqueue_log("found %ld bytes to write", data);
} break;
}
knote_fill_kevent(kn, kev, data);
return 1;
};
static int dkqueue_file_peek(struct knote* kn) {
dkqueue_log("file-based filter for %p is being peeked at", kn->kn_fp);
// this function is only ever called when our file can't poll.
// so as long as the file exists, we're active.
return 1;
};
const static struct filterops file_filtops = {
.f_isfd = true,
.f_attach = dkqueue_file_attach,
.f_detach = dkqueue_file_detach,
.f_event = dkqueue_file_event,
.f_touch = dkqueue_file_touch,
.f_process = dkqueue_file_process,
.f_peek = dkqueue_file_peek,
};
//
// process filter
//
// TODO: NOTE_REAP delivery.
// the task registry is already set up for this, but we need to actually send notifications for
// when a process is reaping another (most likely via an LKM hook injected into the various `wait()` system calls in userspace)
// actually, TODO: a couple more NOTEs
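// hedged illustration (not compiled into the LKM): the userspace usage this filter
// implements, using the standard BSD <sys/event.h> API. watching a process for exit,
// fork, and exec looks like this on Darwin:
#if 0
#include <sys/types.h>
#include <sys/event.h>
static int watch_process(int kq, pid_t target_pid) {
	struct kevent change;
	EV_SET(&change, target_pid, EVFILT_PROC, EV_ADD, NOTE_EXIT | NOTE_FORK | NOTE_EXEC, 0, NULL);
	return kevent(kq, &change, 1, NULL, 0, NULL);
}
#endif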
static void dkqueue_proc_init(void) {
proc_context_cache = kmem_cache_create("dkq_proc_context", sizeof(dkqueue_proc_context_t), 0, 0, NULL);
};
static void dkqueue_proc_listener(int pid, void* context, darling_proc_event_t event, unsigned long extra) {
dkqueue_proc_context_t* ctx = context;
struct knote* kn = ctx->kn;
struct kqueue* kq = knote_get_kq(kn);
uint32_t* saved_data;
// since we don't actually save a process to `kn_proc`,
// we use it to save our data (we can't use kn_hook32 because we already assign our context to kn_hook)
saved_data = (uint32_t*)&kn->kn_proc;
*saved_data = 0;
dkqueue_log("received event %d from Darling's process notification system for Linux PID %d with extra=%ld", event, pid, extra);
switch (event) {
case DTE_EXITED: {
kn->kn_fflags |= NOTE_EXIT;
if ((kn->kn_sfflags & NOTE_EXITSTATUS) != 0) {
*saved_data = extra & NOTE_PDATAMASK;
}
// we won't get any more events; destroy our context
// note that we don't need to call `darling_proc_notify_deregister` (because the listener won't be called again)
kn->kn_hook = NULL;
kmem_cache_free(proc_context_cache, ctx);
// also, since we won't get any more events, we can go ahead and tell the user that we've reached EOF
// (we do it even if they didn't ask to know about NOTE_EXIT; this is how XNU does it)
//
// BUG: XNU checks that either the event is NOTE_REAP or the event is NOTE_EXIT AND the user hasn't requested NOTE_REAP.
// since we don't have NOTE_REAP implemented yet, we always remove it on NOTE_EXIT.
// TODO: actually implement NOTE_REAP and check if the user wants that before telling them we're done.
kn->kn_flags |= EV_EOF | EV_ONESHOT;
dkqueue_log("process is exiting; marking knote as done");
} break;
case DTE_FORKED: {
kn->kn_fflags |= NOTE_FORK;
// XNU doesn't seem to pass the child PID through to userspace?
//*saved_data = extra & NOTE_PDATAMASK;
} break;
case DTE_REPLACED: {
kn->kn_fflags |= NOTE_EXEC;
} break;
}
// ignore events the user doesn't want
kn->kn_fflags &= kn->kn_sfflags;
if (kn->kn_fflags != 0) {
dkqueue_log("activating process filter knote %p on PID %ld", kn, kn->kn_id);
// tell kqueue we've got some events for it
kqlock(kq);
knote_activate(kq, kn, FILTER_ACTIVE);
kqunlock(kq);
}
};
static int dkqueue_proc_attach(struct knote* kn, struct kevent_qos_s* kev) {
struct pid* linux_pid = NULL;
int xnu_pid = 0;
dkqueue_proc_context_t* ctx = NULL;
// NOTE: XNU has completely removed support for these notes.
// do we want to implement them in Darling for backwards compat?
if ((kn->kn_sfflags & (NOTE_TRACK | NOTE_TRACKERR | NOTE_CHILD)) != 0) {
knote_set_error(kn, ENOTSUP);
goto error_out;
}
// TODO: XNU's EVFILT_PROC checks to make sure the process asking to listen to another process
// actually has permission to do so (i.e. it's a parent or a debugger)
kn->kn_flags |= EV_CLEAR;
kn->kn_sdata = 0;
kn->kn_fflags = 0;
rcu_read_lock();
linux_pid = find_vpid(kn->kn_id);
if (!linux_pid) {
rcu_read_unlock();
knote_set_error(kn, ESRCH);
dkqueue_log("failed to find XNU task for Darling PID %ld", kn->kn_id);
goto error_out;
}
xnu_pid = pid_nr(linux_pid);
rcu_read_unlock();
dkqueue_log("attaching process filter to process with Darling PID %ld and Linux PID %d", kn->kn_id, xnu_pid);
ctx = kmem_cache_alloc(proc_context_cache, GFP_KERNEL);
if (!ctx) {
knote_set_error(kn, ENOMEM);
goto error_out;
}
memset(ctx, 0, sizeof(dkqueue_proc_context_t));
kn->kn_hook = ctx;
ctx->kn = kn;
ctx->xnu_pid = xnu_pid;
ctx->registration_id = darling_proc_notify_register(xnu_pid, dkqueue_proc_listener, ctx, DTE_ALL_EVENTS);
if (ctx->registration_id < 0) {
// `darling_proc_notify_register` currently only fails if it didn't find the task
knote_set_error(kn, ESRCH);
dkqueue_log("failed to attach a listener onto process with Darling PID %ld", kn->kn_id);
goto error_out;
}
// we *might* already have events
return kn->kn_fflags != 0;
error_out:
if (ctx) {
if (ctx->registration_id >= 0) {
darling_proc_notify_deregister(xnu_pid, ctx->registration_id);
}
kmem_cache_free(proc_context_cache, ctx);
}
return 0;
};
static void dkqueue_proc_detach(struct knote* kn) {
dkqueue_proc_context_t* ctx = kn->kn_hook;
// the task we were watching might've died before we detached.
// in that case, our event listener would already have cleaned up for us.
if (!ctx) {
return;
}
dkqueue_log("detaching process filter from process with Linux PID %d", ctx->xnu_pid);
darling_proc_notify_deregister(ctx->xnu_pid, ctx->registration_id);
kmem_cache_free(proc_context_cache, ctx);
};
static int dkqueue_proc_event(struct knote* kn, long hint) {
return kn->kn_fflags != 0;
};
static int dkqueue_proc_touch(struct knote* kn, struct kevent_qos_s* kev) {
kn->kn_sfflags = kev->fflags;
// XNU explicitly does NOT restrict the current events with the new flags, for backwards compatibility
//kn->kn_fflags &= kn->kn_sfflags;
return kn->kn_fflags != 0;
};
static int dkqueue_proc_process(struct knote* kn, struct kevent_qos_s* kev) {
uint32_t* saved_data = (uint32_t*)&kn->kn_proc;
if (kn->kn_fflags == 0) {
// we've got no events ready
return 0;
}
dkqueue_log("filling kevent data from process filter knote %p", kn);
knote_fill_kevent(kn, kev, *saved_data);
*saved_data = 0;
return 1;
};
const static struct filterops proc_filtops = {
.f_attach = dkqueue_proc_attach,
.f_detach = dkqueue_proc_detach,
.f_event = dkqueue_proc_event,
.f_touch = dkqueue_proc_touch,
.f_process = dkqueue_proc_process,
};
//
// dkqueue-specific vnode filter
//
// on XNU, EVFILT_VNODE just refers to file_filtops, but for us, it's easier to implement a separate filter
// because we have to listen for events very differently than we do for regular files
//
// for our entire implementation, we'll be referring to Linux's implementation of inotify on top of fsnotify
// for the correlation between fsnotify events and EVFILT_VNODE notes,
// see XNU's `tests/kqueue_file_tests.c` to see the expected triggers of each EVFILT_VNODE note
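// hedged illustration (not compiled into the LKM): the userspace usage this filter
// implements, using the standard BSD <sys/event.h> API. watching a vnode for writes
// and deletion looks like this on Darwin (O_EVTONLY opens the file for event
// notification only):
#if 0
#include <sys/event.h>
#include <fcntl.h>
static int watch_vnode(int kq, const char* path) {
	int fd = open(path, O_EVTONLY);
	if (fd < 0) {
		return -1;
	}
	struct kevent change;
	EV_SET(&change, fd, EVFILT_VNODE, EV_ADD | EV_CLEAR, NOTE_WRITE | NOTE_DELETE, 0, NULL);
	return kevent(kq, &change, 1, NULL, 0, NULL);
}
#endif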
static void dkqueue_vnode_init(void) {
vnode_context_cache = kmem_cache_create("dkq_vnode_ctx", sizeof(dkqueue_vnode_context_t), 0, 0, NULL);
};
#if LINUX_VERSION_CODE >= KERNEL_VERSION(5,9,0)
static int dkqueue_vnode_fs_handle_event(struct fsnotify_group *group, u32 mask, const void *data, int data_type, struct inode *dir, const struct qstr* file_name, u32 cookie, struct fsnotify_iter_info* iter_info) {
#else
static int dkqueue_vnode_fs_handle_event(struct fsnotify_group* group, struct inode* inode, u32 mask, const void* data, int data_type, const struct qstr* file_name, u32 cookie, struct fsnotify_iter_info* iter_info) {
#endif
struct fsnotify_mark* fs_mark = fsnotify_iter_inode_mark(iter_info);
dkqueue_vnode_context_t* ctx = (dkqueue_vnode_context_t*)fs_mark;
uint32_t vnode_event_mask = 0;
struct kqueue* kq = fs_mark->group->private;
#if LINUX_VERSION_CODE >= KERNEL_VERSION(5,9,0)
struct inode* inode = fsnotify_data_inode(data, data_type);
if (!inode) {
dkqueue_log("received fsnotify event without an inode; ignoring...");
return 0;
}
#endif
dkqueue_log("received fsnotify event for vnode filter");
if ((mask & FS_DELETE_SELF) != 0) {
vnode_event_mask |= NOTE_DELETE;
}
if ((mask & FS_MOVE_SELF) != 0) {
vnode_event_mask |= NOTE_RENAME;
}
if ((mask & FS_UNMOUNT) != 0) {
vnode_event_mask |= NOTE_REVOKE;
}
if ((mask & FS_CREATE) != 0) {
// it's not clear from `kqueue_file_tests.c` whether creating a directory in a watched directory
// produces both NOTE_WRITE and NOTE_LINK, or just NOTE_LINK.
// for now, we assume it produces both.
// TODO: test this on a mac and determine the proper behavior.
vnode_event_mask |= NOTE_WRITE | ((mask & FS_ISDIR) != 0 ? NOTE_LINK : 0);
}
if ((mask & FS_MOVE) != 0) {
// same TODO as with FS_CREATE
vnode_event_mask |= NOTE_WRITE | ((mask & FS_ISDIR) != 0 ? NOTE_LINK : 0);
}
if ((mask & FS_DELETE) != 0) {
// same TODO as with FS_CREATE
vnode_event_mask |= NOTE_WRITE | ((mask & FS_ISDIR) != 0 ? NOTE_LINK : 0);
}
if ((mask & FS_MODIFY) != 0) {
vnode_event_mask |= NOTE_WRITE;
// make sure the file has actually been extended
if (ctx->cached_size < (ctx->cached_size = i_size_read(inode))) {
vnode_event_mask |= NOTE_EXTEND;
}
}
if ((mask & FS_ATTRIB) != 0) {
// BUG: NOTE_ATTRIB should NOT be set if only the inode's link count has changed (that's what NOTE_LINK is for)
// ...but how do we determine if that's the *only* thing that changed?
// we *could* cache all the other possible attributes and check if they've changed, but there *must* be a better way
vnode_event_mask |= NOTE_ATTRIB;
if (ctx->cached_nlink != (ctx->cached_nlink = inode->i_nlink)) {
vnode_event_mask |= NOTE_LINK;
}
}
vnode_context_lock(ctx);
ctx->kn->kn_fflags |= vnode_event_mask & ctx->kn->kn_sfflags;
if (ctx->kn->kn_fflags != 0) {
dkqueue_log("activating vnode filter knote %p", ctx->kn);
kqlock(kq);
knote_activate(kq, ctx->kn, FILTER_ACTIVE);
kqunlock(kq);
}
vnode_context_unlock(ctx);
return 0;
};
static void dkqueue_vnode_free_group(struct fsnotify_group *group) {
// remove the group from the kqueue
// this shouldn't be necessary, however, as the only time the group should be dying is due to `dkqueue_release`
// we normally don't "get" or "put" the group because we keep it alive as long as the kqueue is alive
struct kqueue* kq = group->private;
kq->dkq_fs_group = NULL;
};
#if LINUX_VERSION_CODE < KERNEL_VERSION(5,16,0)
static void dkqueue_vnode_fs_free_event(struct fsnotify_event* event) {
#else
static void dkqueue_vnode_fs_free_event(struct fsnotify_group* group, struct fsnotify_event* event) {
#endif
// unused but required by fsnotify
};
static void dkqueue_vnode_fs_free_mark(struct fsnotify_mark* mark) {
dkqueue_vnode_context_t* ctx = (dkqueue_vnode_context_t*)mark;
kmem_cache_free(vnode_context_cache, ctx);
};
/**
* @brief Update the fsnotify mark with the events we want to listen for.
*
* Called with the vnode context (`ctx`) locked
*/
static void dkqueue_vnode_update_wanted_events(dkqueue_vnode_context_t* ctx) {
ctx->fs_mark.mask = 0;
#define VNODE_TRANSLATE(a, b) if ((ctx->kn->kn_sfflags & (a)) != 0) { ctx->fs_mark.mask |= (b); }
VNODE_TRANSLATE(NOTE_DELETE, FS_DELETE_SELF);
VNODE_TRANSLATE(NOTE_WRITE, FS_MODIFY | ALL_FSNOTIFY_DIRENT_EVENTS);
VNODE_TRANSLATE(NOTE_EXTEND, FS_MODIFY);
VNODE_TRANSLATE(NOTE_ATTRIB, FS_ATTRIB);
VNODE_TRANSLATE(NOTE_LINK, FS_ATTRIB | ALL_FSNOTIFY_DIRENT_EVENTS);
VNODE_TRANSLATE(NOTE_RENAME, FS_MOVE_SELF);
VNODE_TRANSLATE(NOTE_REVOKE, FS_UNMOUNT); // not sure how we're supposed to report revoke(2) usage
VNODE_TRANSLATE(NOTE_NONE, 0); // ???
VNODE_TRANSLATE(NOTE_FUNLOCK, 0); // not sure how we're supposed to detect this (Darling userspace hook and hope that Linux doesn't touch the file?)
#undef VNODE_TRANSLATE
};
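// for reference, `VNODE_TRANSLATE(NOTE_DELETE, FS_DELETE_SELF)` above expands to:
//     if ((ctx->kn->kn_sfflags & NOTE_DELETE) != 0) { ctx->fs_mark.mask |= FS_DELETE_SELF; }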
static void vnode_context_lock(dkqueue_vnode_context_t* ctx) {
// group->mark_mutex prevents the group from trying to use our mark while we're modifying it
// fs_mark->lock prevents someone else from modifying our mark while we are
mutex_lock(&ctx->fs_mark.group->mark_mutex);
spin_lock(&ctx->fs_mark.lock);
};
static void vnode_context_unlock(dkqueue_vnode_context_t* ctx) {
spin_unlock(&ctx->fs_mark.lock);
mutex_unlock(&ctx->fs_mark.group->mark_mutex);
};
static int dkqueue_vnode_attach(struct knote* kn, struct kevent_qos_s* kev) {
struct kqueue* kq = knote_get_kq(kn);
struct fsnotify_group* group = NULL;
dkqueue_vnode_context_t* ctx = NULL;
struct fsnotify_mark* fs_mark = NULL;
struct inode* inode = file_inode(kn->kn_fp);
// use the existing fsnotify group if there is one
if (kq->dkq_fs_group) {
dkqueue_log("reusing fsnotify group %p for kqueue %p", kq->dkq_fs_group, kq);
group = kq->dkq_fs_group;
//fsnotify_get_group(group);
} else {
// otherwise, create it
dkqueue_log("creating an fsnotify group for kqueue %p", kq);
kq->dkq_fs_group = group = fsnotify_alloc_group(&dkqueue_vnode_fsops);
if (IS_ERR(group)) {
// `fsnotify_alloc_group` only ever fails due to ENOMEM
knote_set_error(kn, ENOMEM);
kq->dkq_fs_group = NULL;
dkqueue_log("failed to allocate fsnotify group");
goto error_out;
}
group->private = kq;
}
ctx = kmem_cache_alloc(vnode_context_cache, GFP_KERNEL);
if (!ctx) {
knote_set_error(kn, ENOMEM);
dkqueue_log("failed to allocate fsnotify mark context");
goto error_out;
}
memset(ctx, 0, sizeof(dkqueue_vnode_context_t));
fsnotify_init_mark(&ctx->fs_mark, group);
fs_mark = &ctx->fs_mark;
// attach it onto the node (`true` = allow duplicates).
// allowing duplicates greatly simplifies mark management but also means we can't use `fsnotify_find_mark` to find our mark.
fsnotify_add_inode_mark(fs_mark, inode, true);
ctx->cached_size = i_size_read(inode);
ctx->cached_nlink = inode->i_nlink;
// the event listener callback will check our knote for the events we want to listen for and activate us when it gets them,
// so we don't need to do anything except make our knote accessible to it and update fsnotify about the events our mark wants
ctx->kn = kn;
kn->kn_hook = ctx;
vnode_context_lock(ctx);
dkqueue_vnode_update_wanted_events(ctx);
vnode_context_unlock(ctx);
// as long as our listener exists, we hold a reference to the group and another to the mark
return 0;
error_out:
if (fs_mark) {
// will free `ctx` for us
fsnotify_put_mark(fs_mark);
}
//if (group) {
// fsnotify_put_group(group);
//}
return 0;
};
static void dkqueue_vnode_detach(struct knote* kn) {
dkqueue_vnode_context_t* ctx = kn->kn_hook;
dkqueue_log("detaching vnode filter with mark %p", &ctx->fs_mark);
fsnotify_put_mark(&ctx->fs_mark);
};
static int dkqueue_vnode_event(struct knote* kn, long hint) {
// we don't take hints
return kn->kn_fflags != 0;
};
static int dkqueue_vnode_touch(struct knote* kn, struct kevent_qos_s* kev) {
dkqueue_vnode_context_t* ctx = kn->kn_hook;
kn->kn_sfflags = kev->fflags;
vnode_context_lock(ctx);
dkqueue_vnode_update_wanted_events(ctx);
kn->kn_fflags &= kn->kn_sfflags; // ignore queued events that the user is no longer interested in
vnode_context_unlock(ctx);
return kn->kn_fflags != 0;
};
static int dkqueue_vnode_process(struct knote* kn, struct kevent_qos_s* kev) {
// `dkqueue_vnode_fs_handle_event` does all the processing for us
return kn->kn_fflags != 0;
};
static int dkqueue_vnode_peek(struct knote* kn) {
return kn->kn_fflags != 0;
};
const static struct filterops dkqueue_vnode_filtops = {
.f_isfd = true,
.f_attach = dkqueue_vnode_attach,
.f_detach = dkqueue_vnode_detach,
.f_event = dkqueue_vnode_event,
.f_touch = dkqueue_vnode_touch,
.f_process = dkqueue_vnode_process,
.f_peek = dkqueue_vnode_peek,
};
//
// other functions
//
// used to close kqueues on fork
static void dkqueue_fork_listener(int pid, void* context, darling_proc_event_t event, unsigned long extra) {
task_t task = darling_task_get(pid);
proc_t parent_proc = get_bsdtask_info(task);
dkqueue_list_entry_t* curr;
// we're called in the execution context of the child, so we can just use `darling_close_fd` to close the child's fds for the parent's kqueues
proc_fdlock(parent_proc);
LIST_FOREACH(curr, &parent_proc->p_fd->kqueue_list, link) {
dkqueue_log("closing kqueue with fd %d on fork", curr->fd);
proc_fdunlock(parent_proc);
darling_close_fd(curr->fd);
proc_fdlock(parent_proc);
}
proc_fdunlock(parent_proc);
task_deallocate(task);
};
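// hedged illustration (not compiled into the LKM): the Darwin semantic the listener
// above implements. kqueue descriptors are not inherited across fork(2), so Darling
// closes them in the child, where further kevent() calls fail with EBADF:
#if 0
#include <sys/event.h>
#include <unistd.h>
static void child_cannot_use_parent_kqueue(void) {
	int kq = kqueue();
	if (fork() == 0) {
		// in the child, the fork listener above has already closed `kq`,
		// so this returns -1 with errno == EBADF
		kevent(kq, NULL, 0, NULL, 0, NULL);
	}
}
#endif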
static void dkqueue_list_init(void) {
dkqueue_list_entry_cache = kmem_cache_create("dkq_le_cache", sizeof(dkqueue_list_entry_t), 0, 0, NULL);
};
//
// functions for the rest of the LKM
//
void dkqueue_init() {
dkqueue_file_init();
dkqueue_proc_init();
dkqueue_list_init();
dkqueue_vnode_init();
};
void dkqueue_clear(struct proc* proc) {
struct filedesc* fdp = proc->p_fd;
struct dkqueue_list* list = &fdp->kqueue_list;
struct file* curr_file;
while (!LIST_EMPTY(list)) {
dkqueue_list_entry_t* curr = LIST_FIRST(list);
LIST_REMOVE(curr, link);
// nullify the file's private data so dkqueue_release doesn't do anything
curr->kq->kqf_kqueue.dkq_fp->private_data = NULL;
kqueue_drain(curr->kq);
proc_fdunlock(proc);
kqueue_close(curr->kq);
proc_fdlock(proc);
kmem_cache_free(dkqueue_list_entry_cache, curr);
}
};
int darling_kqueue_create(struct task* task) {
proc_t proc = PROC_NULL;
struct kqfile* kq = NULL;
int fd = -1;
int result = 0;
dkqueue_list_entry_t* le = NULL;
dkqueue_log("creating kqueue on task %p", task);
proc = (proc_t)get_bsdtask_info(task);
if (!proc) {
// we can't create a kqueue for a plain Mach task.
// this check is kind of useless at the moment, though,
// since it's currently impossible to create a plain Mach task in Darling
result = -LINUX_EINVAL;
dkqueue_log("failed to create kqueue on plain Mach task");
goto error_out;
}
// it says it allocates a kqueue, but it's actually a kqfile
kq = (struct kqfile*)kqueue_alloc(proc);
if (!kq) {
result = -LINUX_ENOMEM;
dkqueue_log("failed to allocate kqueue");
goto error_out;
}
le = kmem_cache_alloc(dkqueue_list_entry_cache, GFP_KERNEL);
if (!le) {
result = -LINUX_ENOMEM;
dkqueue_log("failed to allocate list entry for kqueue list");
goto error_out;
}
memset(le, 0, sizeof(dkqueue_list_entry_t));
le->kq = kq;
fd = anon_inode_getfd("kqueue", &dkqueue_ops, kq, O_RDONLY | O_CLOEXEC);
if (fd < 0) {
result = fd;
dkqueue_log("failed to get anon fd for kqueue");
goto error_out;
}
le->fd = fd;
// we use `fcheck` because we DON'T want to increment the file's refcount
// this pointer will only be used as long as the file is open anyways,
// since the kqueue is alive as long as the file is
rcu_read_lock();
kq->kqf_kqueue.dkq_fp = fcheck(fd);
rcu_read_unlock();
// attach it onto the list of open kqueues
proc_fdlock(proc);
LIST_INSERT_HEAD(&proc->p_fd->kqueue_list, le, link);
proc_fdunlock(proc);
dkqueue_log("created kqueue %p on fd %d", kq, fd);
proc_lock(proc);
if (proc->kqueue_fork_listener_id < 0) {
dkqueue_log("setting up fork listener for process");
proc->kqueue_fork_listener_id = darling_proc_notify_register(task->map->linux_task->tgid, dkqueue_fork_listener, NULL, DTE_FORKED);
if (proc->kqueue_fork_listener_id < 0) {
proc_unlock(proc);
result = -LINUX_ESRCH; // `darling_proc_notify_register` only fails with ESRCH
dkqueue_log("darling_kqueue_create: failed to set up fork listener (with status %ld)", proc->kqueue_fork_listener_id);
goto error_out;
}
}
proc_unlock(proc);
return fd;
error_out:
if (fd >= 0) {
darling_close_fd(fd);
} else {
// we only clean up the rest ourselves if the fd still hasn't been created.
// otherwise (if it *has* been created), Linux will call `dkqueue_release` on the file
// and that will clean up the kqueue and the list entry for us
if (kq) {
kqueue_dealloc(&kq->kqf_kqueue);
}
if (le) {
kmem_cache_free(dkqueue_list_entry_cache, le);
}
}
return result;
};
int darling_closing_descriptor(struct task* task, int fd) {
struct knote* kn;
proc_t proc = get_bsdtask_info(task);
proc_fdlock(proc);
rcu_read_lock();
if (!fcheck(fd)) {
rcu_read_unlock();
goto out;
}
rcu_read_unlock();
if (fd < proc->p_fd->fd_knlistsize) {
dkqueue_log("fd %d is closing; closing knote for it...", fd);
knote_fdclose(proc, fd);
}
out:
proc_fdunlock(proc);
return 0;
};