Merge branch 'uprobes/core' of git://git.kernel.org/pub/scm/linux/kernel/git/oleg/misc into perf/core

Improve uprobes performance by adding 'pre-filtering' support,
by Oleg Nesterov:

	# time perl -e 'syscall -1 for 1..100_000'
	real    0m0.040s
	user    0m0.027s
	sys     0m0.010s

	# perf probe -x /lib/libc.so.6 syscall
	# perf record -e probe_libc:syscall sleep 100 &

Before this series:

	# time perl -e 'syscall -1 for 1..100_000'
	real    0m1.714s
	user    0m0.103s
	sys     0m1.607s

After:

	# time perl -e 'syscall -1 for 1..100_000'
	real    0m0.037s
	user    0m0.013s
	sys     0m0.023s

Signed-off-by: Ingo Molnar <mingo@kernel.org>
This commit is contained in:
Ingo Molnar 2013-02-11 10:41:53 +01:00
commit a3d4fd7a2d
8 changed files with 459 additions and 277 deletions

View File

@ -680,8 +680,10 @@ static bool __skip_sstep(struct arch_uprobe *auprobe, struct pt_regs *regs)
if (auprobe->insn[i] == 0x66)
continue;
if (auprobe->insn[i] == 0x90)
if (auprobe->insn[i] == 0x90) {
regs->ip += i + 1;
return true;
}
break;
}

View File

@ -135,16 +135,21 @@ struct hw_perf_event {
struct { /* software */
struct hrtimer hrtimer;
};
struct { /* tracepoint */
struct task_struct *tp_target;
/* for tp_event->class */
struct list_head tp_list;
};
#ifdef CONFIG_HAVE_HW_BREAKPOINT
struct { /* breakpoint */
struct arch_hw_breakpoint info;
struct list_head bp_list;
/*
* Crufty hack to avoid the chicken and egg
* problem hw_breakpoint has with context
* creation and event initalization.
*/
struct task_struct *bp_target;
struct arch_hw_breakpoint info;
struct list_head bp_list;
};
#endif
};

View File

@ -35,13 +35,20 @@ struct inode;
# include <asm/uprobes.h>
#endif
#define UPROBE_HANDLER_REMOVE 1
#define UPROBE_HANDLER_MASK 1
enum uprobe_filter_ctx {
UPROBE_FILTER_REGISTER,
UPROBE_FILTER_UNREGISTER,
UPROBE_FILTER_MMAP,
};
struct uprobe_consumer {
int (*handler)(struct uprobe_consumer *self, struct pt_regs *regs);
/*
* filter is optional; If a filter exists, handler is run
* if and only if filter returns true.
*/
bool (*filter)(struct uprobe_consumer *self, struct task_struct *task);
bool (*filter)(struct uprobe_consumer *self,
enum uprobe_filter_ctx ctx,
struct mm_struct *mm);
struct uprobe_consumer *next;
};
@ -94,6 +101,7 @@ extern int __weak set_swbp(struct arch_uprobe *aup, struct mm_struct *mm, unsign
extern int __weak set_orig_insn(struct arch_uprobe *aup, struct mm_struct *mm, unsigned long vaddr);
extern bool __weak is_swbp_insn(uprobe_opcode_t *insn);
extern int uprobe_register(struct inode *inode, loff_t offset, struct uprobe_consumer *uc);
extern int uprobe_apply(struct inode *inode, loff_t offset, struct uprobe_consumer *uc, bool);
extern void uprobe_unregister(struct inode *inode, loff_t offset, struct uprobe_consumer *uc);
extern int uprobe_mmap(struct vm_area_struct *vma);
extern void uprobe_munmap(struct vm_area_struct *vma, unsigned long start, unsigned long end);
@ -117,6 +125,11 @@ uprobe_register(struct inode *inode, loff_t offset, struct uprobe_consumer *uc)
{
return -ENOSYS;
}
static inline int
uprobe_apply(struct inode *inode, loff_t offset, struct uprobe_consumer *uc, bool add)
{
return -ENOSYS;
}
static inline void
uprobe_unregister(struct inode *inode, loff_t offset, struct uprobe_consumer *uc)
{

View File

@ -6162,11 +6162,14 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
if (task) {
event->attach_state = PERF_ATTACH_TASK;
if (attr->type == PERF_TYPE_TRACEPOINT)
event->hw.tp_target = task;
#ifdef CONFIG_HAVE_HW_BREAKPOINT
/*
* hw_breakpoint is a bit difficult here..
*/
if (attr->type == PERF_TYPE_BREAKPOINT)
else if (attr->type == PERF_TYPE_BREAKPOINT)
event->hw.bp_target = task;
#endif
}

View File

@ -27,6 +27,7 @@
#include <linux/pagemap.h> /* read_mapping_page */
#include <linux/slab.h>
#include <linux/sched.h>
#include <linux/export.h>
#include <linux/rmap.h> /* anon_vma_prepare */
#include <linux/mmu_notifier.h> /* set_pte_at_notify */
#include <linux/swap.h> /* try_to_free_swap */
@ -41,58 +42,31 @@
#define MAX_UPROBE_XOL_SLOTS UINSNS_PER_PAGE
static struct rb_root uprobes_tree = RB_ROOT;
/*
* allows us to skip the uprobe_mmap if there are no uprobe events active
* at this time. Probably a fine grained per inode count is better?
*/
#define no_uprobe_events() RB_EMPTY_ROOT(&uprobes_tree)
static DEFINE_SPINLOCK(uprobes_treelock); /* serialize rbtree access */
#define UPROBES_HASH_SZ 13
/*
* We need separate register/unregister and mmap/munmap lock hashes because
* of mmap_sem nesting.
*
* uprobe_register() needs to install probes on (potentially) all processes
* and thus needs to acquire multiple mmap_sems (consequtively, not
* concurrently), whereas uprobe_mmap() is called while holding mmap_sem
* for the particular process doing the mmap.
*
* uprobe_register()->register_for_each_vma() needs to drop/acquire mmap_sem
* because of lock order against i_mmap_mutex. This means there's a hole in
* the register vma iteration where a mmap() can happen.
*
* Thus uprobe_register() can race with uprobe_mmap() and we can try and
* install a probe where one is already installed.
*/
/* serialize (un)register */
static struct mutex uprobes_mutex[UPROBES_HASH_SZ];
#define uprobes_hash(v) (&uprobes_mutex[((unsigned long)(v)) % UPROBES_HASH_SZ])
/* serialize uprobe->pending_list */
static struct mutex uprobes_mmap_mutex[UPROBES_HASH_SZ];
#define uprobes_mmap_hash(v) (&uprobes_mmap_mutex[((unsigned long)(v)) % UPROBES_HASH_SZ])
static struct percpu_rw_semaphore dup_mmap_sem;
/*
* uprobe_events allows us to skip the uprobe_mmap if there are no uprobe
* events active at this time. Probably a fine grained per inode count is
* better?
*/
static atomic_t uprobe_events = ATOMIC_INIT(0);
/* Have a copy of original instruction */
#define UPROBE_COPY_INSN 0
/* Dont run handlers when first register/ last unregister in progress*/
#define UPROBE_RUN_HANDLER 1
/* Can skip singlestep */
#define UPROBE_SKIP_SSTEP 2
#define UPROBE_SKIP_SSTEP 1
struct uprobe {
struct rb_node rb_node; /* node in the rb tree */
atomic_t ref;
struct rw_semaphore register_rwsem;
struct rw_semaphore consumer_rwsem;
struct mutex copy_mutex; /* TODO: kill me and UPROBE_COPY_INSN */
struct list_head pending_list;
struct uprobe_consumer *consumers;
struct inode *inode; /* Also hold a ref to inode */
@ -430,9 +404,6 @@ static struct uprobe *insert_uprobe(struct uprobe *uprobe)
u = __insert_uprobe(uprobe);
spin_unlock(&uprobes_treelock);
/* For now assume that the instruction need not be single-stepped */
__set_bit(UPROBE_SKIP_SSTEP, &uprobe->flags);
return u;
}
@ -452,8 +423,10 @@ static struct uprobe *alloc_uprobe(struct inode *inode, loff_t offset)
uprobe->inode = igrab(inode);
uprobe->offset = offset;
init_rwsem(&uprobe->register_rwsem);
init_rwsem(&uprobe->consumer_rwsem);
mutex_init(&uprobe->copy_mutex);
/* For now assume that the instruction need not be single-stepped */
__set_bit(UPROBE_SKIP_SSTEP, &uprobe->flags);
/* add to uprobes_tree, sorted on inode:offset */
cur_uprobe = insert_uprobe(uprobe);
@ -463,38 +436,17 @@ static struct uprobe *alloc_uprobe(struct inode *inode, loff_t offset)
kfree(uprobe);
uprobe = cur_uprobe;
iput(inode);
} else {
atomic_inc(&uprobe_events);
}
return uprobe;
}
static void handler_chain(struct uprobe *uprobe, struct pt_regs *regs)
{
struct uprobe_consumer *uc;
if (!test_bit(UPROBE_RUN_HANDLER, &uprobe->flags))
return;
down_read(&uprobe->consumer_rwsem);
for (uc = uprobe->consumers; uc; uc = uc->next) {
if (!uc->filter || uc->filter(uc, current))
uc->handler(uc, regs);
}
up_read(&uprobe->consumer_rwsem);
}
/* Returns the previous consumer */
static struct uprobe_consumer *
consumer_add(struct uprobe *uprobe, struct uprobe_consumer *uc)
static void consumer_add(struct uprobe *uprobe, struct uprobe_consumer *uc)
{
down_write(&uprobe->consumer_rwsem);
uc->next = uprobe->consumers;
uprobe->consumers = uc;
up_write(&uprobe->consumer_rwsem);
return uc->next;
}
/*
@ -588,7 +540,8 @@ static int prepare_uprobe(struct uprobe *uprobe, struct file *file,
if (test_bit(UPROBE_COPY_INSN, &uprobe->flags))
return ret;
mutex_lock(&uprobe->copy_mutex);
/* TODO: move this into _register, until then we abuse this sem. */
down_write(&uprobe->consumer_rwsem);
if (test_bit(UPROBE_COPY_INSN, &uprobe->flags))
goto out;
@ -612,7 +565,30 @@ static int prepare_uprobe(struct uprobe *uprobe, struct file *file,
set_bit(UPROBE_COPY_INSN, &uprobe->flags);
out:
mutex_unlock(&uprobe->copy_mutex);
up_write(&uprobe->consumer_rwsem);
return ret;
}
static inline bool consumer_filter(struct uprobe_consumer *uc,
enum uprobe_filter_ctx ctx, struct mm_struct *mm)
{
return !uc->filter || uc->filter(uc, ctx, mm);
}
static bool filter_chain(struct uprobe *uprobe,
enum uprobe_filter_ctx ctx, struct mm_struct *mm)
{
struct uprobe_consumer *uc;
bool ret = false;
down_read(&uprobe->consumer_rwsem);
for (uc = uprobe->consumers; uc; uc = uc->next) {
ret = consumer_filter(uc, ctx, mm);
if (ret)
break;
}
up_read(&uprobe->consumer_rwsem);
return ret;
}
@ -624,16 +600,6 @@ install_breakpoint(struct uprobe *uprobe, struct mm_struct *mm,
bool first_uprobe;
int ret;
/*
* If probe is being deleted, unregister thread could be done with
* the vma-rmap-walk through. Adding a probe now can be fatal since
* nobody will be able to cleanup. Also we could be from fork or
* mremap path, where the probe might have already been inserted.
* Hence behave as if probe already existed.
*/
if (!uprobe->consumers)
return 0;
ret = prepare_uprobe(uprobe, vma->vm_file, mm, vaddr);
if (ret)
return ret;
@ -658,14 +624,14 @@ install_breakpoint(struct uprobe *uprobe, struct mm_struct *mm,
static int
remove_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, unsigned long vaddr)
{
/* can happen if uprobe_register() fails */
if (!test_bit(MMF_HAS_UPROBES, &mm->flags))
return 0;
set_bit(MMF_RECALC_UPROBES, &mm->flags);
return set_orig_insn(&uprobe->arch, mm, vaddr);
}
static inline bool uprobe_is_active(struct uprobe *uprobe)
{
return !RB_EMPTY_NODE(&uprobe->rb_node);
}
/*
* There could be threads that have already hit the breakpoint. They
* will recheck the current insn and restart if find_uprobe() fails.
@ -673,12 +639,15 @@ remove_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, unsigned long vad
*/
static void delete_uprobe(struct uprobe *uprobe)
{
if (WARN_ON(!uprobe_is_active(uprobe)))
return;
spin_lock(&uprobes_treelock);
rb_erase(&uprobe->rb_node, &uprobes_tree);
spin_unlock(&uprobes_treelock);
RB_CLEAR_NODE(&uprobe->rb_node); /* for uprobe_is_active() */
iput(uprobe->inode);
put_uprobe(uprobe);
atomic_dec(&uprobe_events);
}
struct map_info {
@ -764,8 +733,10 @@ build_map_info(struct address_space *mapping, loff_t offset, bool is_register)
return curr;
}
static int register_for_each_vma(struct uprobe *uprobe, bool is_register)
static int
register_for_each_vma(struct uprobe *uprobe, struct uprobe_consumer *new)
{
bool is_register = !!new;
struct map_info *info;
int err = 0;
@ -794,10 +765,16 @@ static int register_for_each_vma(struct uprobe *uprobe, bool is_register)
vaddr_to_offset(vma, info->vaddr) != uprobe->offset)
goto unlock;
if (is_register)
if (is_register) {
/* consult only the "caller", new consumer. */
if (consumer_filter(new,
UPROBE_FILTER_REGISTER, mm))
err = install_breakpoint(uprobe, mm, vma, info->vaddr);
else
} else if (test_bit(MMF_HAS_UPROBES, &mm->flags)) {
if (!filter_chain(uprobe,
UPROBE_FILTER_UNREGISTER, mm))
err |= remove_breakpoint(uprobe, mm, info->vaddr);
}
unlock:
up_write(&mm->mmap_sem);
@ -810,17 +787,23 @@ static int register_for_each_vma(struct uprobe *uprobe, bool is_register)
return err;
}
static int __uprobe_register(struct uprobe *uprobe)
static int __uprobe_register(struct uprobe *uprobe, struct uprobe_consumer *uc)
{
return register_for_each_vma(uprobe, true);
consumer_add(uprobe, uc);
return register_for_each_vma(uprobe, uc);
}
static void __uprobe_unregister(struct uprobe *uprobe)
static void __uprobe_unregister(struct uprobe *uprobe, struct uprobe_consumer *uc)
{
if (!register_for_each_vma(uprobe, false))
delete_uprobe(uprobe);
int err;
if (!consumer_del(uprobe, uc)) /* WARN? */
return;
err = register_for_each_vma(uprobe, NULL);
/* TODO : cant unregister? schedule a worker thread */
if (!uprobe->consumers && !err)
delete_uprobe(uprobe);
}
/*
@ -845,30 +828,58 @@ int uprobe_register(struct inode *inode, loff_t offset, struct uprobe_consumer *
struct uprobe *uprobe;
int ret;
if (!inode || !uc || uc->next)
return -EINVAL;
/* Racy, just to catch the obvious mistakes */
if (offset > i_size_read(inode))
return -EINVAL;
ret = 0;
mutex_lock(uprobes_hash(inode));
retry:
uprobe = alloc_uprobe(inode, offset);
if (!uprobe) {
ret = -ENOMEM;
} else if (!consumer_add(uprobe, uc)) {
ret = __uprobe_register(uprobe);
if (ret) {
uprobe->consumers = NULL;
__uprobe_unregister(uprobe);
} else {
set_bit(UPROBE_RUN_HANDLER, &uprobe->flags);
}
if (!uprobe)
return -ENOMEM;
/*
* We can race with uprobe_unregister()->delete_uprobe().
* Check uprobe_is_active() and retry if it is false.
*/
down_write(&uprobe->register_rwsem);
ret = -EAGAIN;
if (likely(uprobe_is_active(uprobe))) {
ret = __uprobe_register(uprobe, uc);
if (ret)
__uprobe_unregister(uprobe, uc);
}
up_write(&uprobe->register_rwsem);
put_uprobe(uprobe);
mutex_unlock(uprobes_hash(inode));
if (uprobe)
if (unlikely(ret == -EAGAIN))
goto retry;
return ret;
}
EXPORT_SYMBOL_GPL(uprobe_register);
/*
* uprobe_apply - unregister a already registered probe.
* @inode: the file in which the probe has to be removed.
* @offset: offset from the start of the file.
* @uc: consumer which wants to add more or remove some breakpoints
* @add: add or remove the breakpoints
*/
int uprobe_apply(struct inode *inode, loff_t offset,
struct uprobe_consumer *uc, bool add)
{
struct uprobe *uprobe;
struct uprobe_consumer *con;
int ret = -ENOENT;
uprobe = find_uprobe(inode, offset);
if (!uprobe)
return ret;
down_write(&uprobe->register_rwsem);
for (con = uprobe->consumers; con && con != uc ; con = con->next)
;
if (con)
ret = register_for_each_vma(uprobe, add ? uc : NULL);
up_write(&uprobe->register_rwsem);
put_uprobe(uprobe);
return ret;
@ -884,25 +895,43 @@ void uprobe_unregister(struct inode *inode, loff_t offset, struct uprobe_consume
{
struct uprobe *uprobe;
if (!inode || !uc)
return;
uprobe = find_uprobe(inode, offset);
if (!uprobe)
return;
mutex_lock(uprobes_hash(inode));
if (consumer_del(uprobe, uc)) {
if (!uprobe->consumers) {
__uprobe_unregister(uprobe);
clear_bit(UPROBE_RUN_HANDLER, &uprobe->flags);
}
}
mutex_unlock(uprobes_hash(inode));
down_write(&uprobe->register_rwsem);
__uprobe_unregister(uprobe, uc);
up_write(&uprobe->register_rwsem);
put_uprobe(uprobe);
}
EXPORT_SYMBOL_GPL(uprobe_unregister);
static int unapply_uprobe(struct uprobe *uprobe, struct mm_struct *mm)
{
struct vm_area_struct *vma;
int err = 0;
down_read(&mm->mmap_sem);
for (vma = mm->mmap; vma; vma = vma->vm_next) {
unsigned long vaddr;
loff_t offset;
if (!valid_vma(vma, false) ||
vma->vm_file->f_mapping->host != uprobe->inode)
continue;
offset = (loff_t)vma->vm_pgoff << PAGE_SHIFT;
if (uprobe->offset < offset ||
uprobe->offset >= offset + vma->vm_end - vma->vm_start)
continue;
vaddr = offset_to_vaddr(vma, uprobe->offset);
err |= remove_breakpoint(uprobe, mm, vaddr);
}
up_read(&mm->mmap_sem);
return err;
}
static struct rb_node *
find_node_in_range(struct inode *inode, loff_t min, loff_t max)
@ -978,7 +1007,7 @@ int uprobe_mmap(struct vm_area_struct *vma)
struct uprobe *uprobe, *u;
struct inode *inode;
if (!atomic_read(&uprobe_events) || !valid_vma(vma, true))
if (no_uprobe_events() || !valid_vma(vma, true))
return 0;
inode = vma->vm_file->f_mapping->host;
@ -987,9 +1016,14 @@ int uprobe_mmap(struct vm_area_struct *vma)
mutex_lock(uprobes_mmap_hash(inode));
build_probe_list(inode, vma, vma->vm_start, vma->vm_end, &tmp_list);
/*
* We can race with uprobe_unregister(), this uprobe can be already
* removed. But in this case filter_chain() must return false, all
* consumers have gone away.
*/
list_for_each_entry_safe(uprobe, u, &tmp_list, pending_list) {
if (!fatal_signal_pending(current)) {
if (!fatal_signal_pending(current) &&
filter_chain(uprobe, UPROBE_FILTER_MMAP, vma->vm_mm)) {
unsigned long vaddr = offset_to_vaddr(vma, uprobe->offset);
install_breakpoint(uprobe, vma->vm_mm, vma, vaddr);
}
@ -1024,7 +1058,7 @@ vma_has_uprobes(struct vm_area_struct *vma, unsigned long start, unsigned long e
*/
void uprobe_munmap(struct vm_area_struct *vma, unsigned long start, unsigned long end)
{
if (!atomic_read(&uprobe_events) || !valid_vma(vma, false))
if (no_uprobe_events() || !valid_vma(vma, false))
return;
if (!atomic_read(&vma->vm_mm->mm_users)) /* called by mmput() ? */
@ -1041,22 +1075,14 @@ void uprobe_munmap(struct vm_area_struct *vma, unsigned long start, unsigned lon
/* Slot allocation for XOL */
static int xol_add_vma(struct xol_area *area)
{
struct mm_struct *mm;
int ret;
area->page = alloc_page(GFP_HIGHUSER);
if (!area->page)
return -ENOMEM;
ret = -EALREADY;
mm = current->mm;
struct mm_struct *mm = current->mm;
int ret = -EALREADY;
down_write(&mm->mmap_sem);
if (mm->uprobes_state.xol_area)
goto fail;
ret = -ENOMEM;
/* Try to map as high as possible, this is only a hint. */
area->vaddr = get_unmapped_area(NULL, TASK_SIZE - PAGE_SIZE, PAGE_SIZE, 0, 0);
if (area->vaddr & ~PAGE_MASK) {
@ -1072,54 +1098,53 @@ static int xol_add_vma(struct xol_area *area)
smp_wmb(); /* pairs with get_xol_area() */
mm->uprobes_state.xol_area = area;
ret = 0;
fail:
fail:
up_write(&mm->mmap_sem);
if (ret)
__free_page(area->page);
return ret;
}
static struct xol_area *get_xol_area(struct mm_struct *mm)
{
struct xol_area *area;
area = mm->uprobes_state.xol_area;
smp_read_barrier_depends(); /* pairs with wmb in xol_add_vma() */
return area;
}
/*
* xol_alloc_area - Allocate process's xol_area.
* This area will be used for storing instructions for execution out of
* line.
* get_xol_area - Allocate process's xol_area if necessary.
* This area will be used for storing instructions for execution out of line.
*
* Returns the allocated area or NULL.
*/
static struct xol_area *xol_alloc_area(void)
static struct xol_area *get_xol_area(void)
{
struct mm_struct *mm = current->mm;
struct xol_area *area;
area = mm->uprobes_state.xol_area;
if (area)
goto ret;
area = kzalloc(sizeof(*area), GFP_KERNEL);
if (unlikely(!area))
return NULL;
goto out;
area->bitmap = kzalloc(BITS_TO_LONGS(UINSNS_PER_PAGE) * sizeof(long), GFP_KERNEL);
if (!area->bitmap)
goto fail;
goto free_area;
area->page = alloc_page(GFP_HIGHUSER);
if (!area->page)
goto free_bitmap;
init_waitqueue_head(&area->wq);
if (!xol_add_vma(area))
return area;
fail:
__free_page(area->page);
free_bitmap:
kfree(area->bitmap);
free_area:
kfree(area);
return get_xol_area(current->mm);
out:
area = mm->uprobes_state.xol_area;
ret:
smp_read_barrier_depends(); /* pairs with wmb in xol_add_vma() */
return area;
}
/*
@ -1185,33 +1210,26 @@ static unsigned long xol_take_insn_slot(struct xol_area *area)
}
/*
* xol_get_insn_slot - If was not allocated a slot, then
* allocate a slot.
* xol_get_insn_slot - allocate a slot for xol.
* Returns the allocated slot address or 0.
*/
static unsigned long xol_get_insn_slot(struct uprobe *uprobe, unsigned long slot_addr)
static unsigned long xol_get_insn_slot(struct uprobe *uprobe)
{
struct xol_area *area;
unsigned long offset;
unsigned long xol_vaddr;
void *vaddr;
area = get_xol_area(current->mm);
if (!area) {
area = xol_alloc_area();
area = get_xol_area();
if (!area)
return 0;
}
current->utask->xol_vaddr = xol_take_insn_slot(area);
/*
* Initialize the slot if xol_vaddr points to valid
* instruction slot.
*/
if (unlikely(!current->utask->xol_vaddr))
xol_vaddr = xol_take_insn_slot(area);
if (unlikely(!xol_vaddr))
return 0;
current->utask->vaddr = slot_addr;
offset = current->utask->xol_vaddr & ~PAGE_MASK;
/* Initialize the slot */
offset = xol_vaddr & ~PAGE_MASK;
vaddr = kmap_atomic(area->page);
memcpy(vaddr + offset, uprobe->arch.insn, MAX_UINSN_BYTES);
kunmap_atomic(vaddr);
@ -1221,7 +1239,7 @@ static unsigned long xol_get_insn_slot(struct uprobe *uprobe, unsigned long slot
*/
flush_dcache_page(area->page);
return current->utask->xol_vaddr;
return xol_vaddr;
}
/*
@ -1239,8 +1257,7 @@ static void xol_free_insn_slot(struct task_struct *tsk)
return;
slot_addr = tsk->utask->xol_vaddr;
if (unlikely(!slot_addr || IS_ERR_VALUE(slot_addr)))
if (unlikely(!slot_addr))
return;
area = tsk->mm->uprobes_state.xol_area;
@ -1302,33 +1319,48 @@ void uprobe_copy_process(struct task_struct *t)
}
/*
* Allocate a uprobe_task object for the task.
* Called when the thread hits a breakpoint for the first time.
* Allocate a uprobe_task object for the task if if necessary.
* Called when the thread hits a breakpoint.
*
* Returns:
* - pointer to new uprobe_task on success
* - NULL otherwise
*/
static struct uprobe_task *add_utask(void)
static struct uprobe_task *get_utask(void)
{
struct uprobe_task *utask;
utask = kzalloc(sizeof *utask, GFP_KERNEL);
if (unlikely(!utask))
return NULL;
current->utask = utask;
return utask;
if (!current->utask)
current->utask = kzalloc(sizeof(struct uprobe_task), GFP_KERNEL);
return current->utask;
}
/* Prepare to single-step probed instruction out of line. */
static int
pre_ssout(struct uprobe *uprobe, struct pt_regs *regs, unsigned long vaddr)
pre_ssout(struct uprobe *uprobe, struct pt_regs *regs, unsigned long bp_vaddr)
{
if (xol_get_insn_slot(uprobe, vaddr) && !arch_uprobe_pre_xol(&uprobe->arch, regs))
return 0;
struct uprobe_task *utask;
unsigned long xol_vaddr;
int err;
return -EFAULT;
utask = get_utask();
if (!utask)
return -ENOMEM;
xol_vaddr = xol_get_insn_slot(uprobe);
if (!xol_vaddr)
return -ENOMEM;
utask->xol_vaddr = xol_vaddr;
utask->vaddr = bp_vaddr;
err = arch_uprobe_pre_xol(&uprobe->arch, regs);
if (unlikely(err)) {
xol_free_insn_slot(current);
return err;
}
utask->active_uprobe = uprobe;
utask->state = UTASK_SSTEP;
return 0;
}
/*
@ -1390,6 +1422,7 @@ static void mmf_recalc_uprobes(struct mm_struct *mm)
* This is not strictly accurate, we can race with
* uprobe_unregister() and see the already removed
* uprobe if delete_uprobe() was not yet called.
* Or this uprobe can be filtered out.
*/
if (vma_has_uprobes(vma, vma->vm_start, vma->vm_end))
return;
@ -1451,13 +1484,33 @@ static struct uprobe *find_active_uprobe(unsigned long bp_vaddr, int *is_swbp)
return uprobe;
}
static void handler_chain(struct uprobe *uprobe, struct pt_regs *regs)
{
struct uprobe_consumer *uc;
int remove = UPROBE_HANDLER_REMOVE;
down_read(&uprobe->register_rwsem);
for (uc = uprobe->consumers; uc; uc = uc->next) {
int rc = uc->handler(uc, regs);
WARN(rc & ~UPROBE_HANDLER_MASK,
"bad rc=0x%x from %pf()\n", rc, uc->handler);
remove &= rc;
}
if (remove && uprobe->consumers) {
WARN_ON(!uprobe_is_active(uprobe));
unapply_uprobe(uprobe, current->mm);
}
up_read(&uprobe->register_rwsem);
}
/*
* Run handler and ask thread to singlestep.
* Ensure all non-fatal signals cannot interrupt thread while it singlesteps.
*/
static void handle_swbp(struct pt_regs *regs)
{
struct uprobe_task *utask;
struct uprobe *uprobe;
unsigned long bp_vaddr;
int uninitialized_var(is_swbp);
@ -1482,6 +1535,10 @@ static void handle_swbp(struct pt_regs *regs)
}
return;
}
/* change it in advance for ->handler() and restart */
instruction_pointer_set(regs, bp_vaddr);
/*
* TODO: move copy_insn/etc into _register and remove this hack.
* After we hit the bp, _unregister + _register can install the
@ -1489,32 +1546,16 @@ static void handle_swbp(struct pt_regs *regs)
*/
smp_rmb(); /* pairs with wmb() in install_breakpoint() */
if (unlikely(!test_bit(UPROBE_COPY_INSN, &uprobe->flags)))
goto restart;
utask = current->utask;
if (!utask) {
utask = add_utask();
/* Cannot allocate; re-execute the instruction. */
if (!utask)
goto restart;
}
goto out;
handler_chain(uprobe, regs);
if (can_skip_sstep(uprobe, regs))
goto out;
if (!pre_ssout(uprobe, regs, bp_vaddr)) {
utask->active_uprobe = uprobe;
utask->state = UTASK_SSTEP;
if (!pre_ssout(uprobe, regs, bp_vaddr))
return;
}
restart:
/*
* cannot singlestep; cannot skip instruction;
* re-execute the instruction.
*/
instruction_pointer_set(regs, bp_vaddr);
/* can_skip_sstep() succeeded, or restart if can't singlestep */
out:
put_uprobe(uprobe);
}
@ -1608,10 +1649,8 @@ static int __init init_uprobes(void)
{
int i;
for (i = 0; i < UPROBES_HASH_SZ; i++) {
mutex_init(&uprobes_mutex[i]);
for (i = 0; i < UPROBES_HASH_SZ; i++)
mutex_init(&uprobes_mmap_mutex[i]);
}
if (percpu_init_rwsem(&dup_mmap_sem))
return -ENOMEM;

View File

@ -712,6 +712,12 @@ static int ptrace_regset(struct task_struct *task, int req, unsigned int type,
kiov->iov_len, kiov->iov_base);
}
/*
* This is declared in linux/regset.h and defined in machine-dependent
* code. We put the export here, near the primary machine-neutral use,
* to ensure no machine forgets it.
*/
EXPORT_SYMBOL_GPL(task_user_regset_view);
#endif
int ptrace_request(struct task_struct *child, long request,

View File

@ -66,7 +66,6 @@
#define TP_FLAG_TRACE 1
#define TP_FLAG_PROFILE 2
#define TP_FLAG_REGISTERED 4
#define TP_FLAG_UPROBE 8
/* data_rloc: data relative location, compatible with u32 */

View File

@ -28,20 +28,21 @@
#define UPROBE_EVENT_SYSTEM "uprobes"
struct trace_uprobe_filter {
rwlock_t rwlock;
int nr_systemwide;
struct list_head perf_events;
};
/*
* uprobe event core functions
*/
struct trace_uprobe;
struct uprobe_trace_consumer {
struct uprobe_consumer cons;
struct trace_uprobe *tu;
};
struct trace_uprobe {
struct list_head list;
struct ftrace_event_class class;
struct ftrace_event_call call;
struct uprobe_trace_consumer *consumer;
struct trace_uprobe_filter filter;
struct uprobe_consumer consumer;
struct inode *inode;
char *filename;
unsigned long offset;
@ -64,6 +65,18 @@ static LIST_HEAD(uprobe_list);
static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs);
static inline void init_trace_uprobe_filter(struct trace_uprobe_filter *filter)
{
rwlock_init(&filter->rwlock);
filter->nr_systemwide = 0;
INIT_LIST_HEAD(&filter->perf_events);
}
static inline bool uprobe_filter_is_empty(struct trace_uprobe_filter *filter)
{
return !filter->nr_systemwide && list_empty(&filter->perf_events);
}
/*
* Allocate new trace_uprobe and initialize it (including uprobes).
*/
@ -92,6 +105,8 @@ alloc_trace_uprobe(const char *group, const char *event, int nargs)
goto error;
INIT_LIST_HEAD(&tu->list);
tu->consumer.handler = uprobe_dispatcher;
init_trace_uprobe_filter(&tu->filter);
return tu;
error:
@ -253,16 +268,18 @@ static int create_trace_uprobe(int argc, char **argv)
if (ret)
goto fail_address_parse;
ret = kstrtoul(arg, 0, &offset);
if (ret)
goto fail_address_parse;
inode = igrab(path.dentry->d_inode);
if (!S_ISREG(inode->i_mode)) {
path_put(&path);
if (!inode || !S_ISREG(inode->i_mode)) {
ret = -EINVAL;
goto fail_address_parse;
}
ret = kstrtoul(arg, 0, &offset);
if (ret)
goto fail_address_parse;
argc -= 2;
argv += 2;
@ -469,7 +486,7 @@ static const struct file_operations uprobe_profile_ops = {
};
/* uprobe handler */
static void uprobe_trace_func(struct trace_uprobe *tu, struct pt_regs *regs)
static int uprobe_trace_func(struct trace_uprobe *tu, struct pt_regs *regs)
{
struct uprobe_trace_entry_head *entry;
struct ring_buffer_event *event;
@ -479,8 +496,6 @@ static void uprobe_trace_func(struct trace_uprobe *tu, struct pt_regs *regs)
unsigned long irq_flags;
struct ftrace_event_call *call = &tu->call;
tu->nhit++;
local_save_flags(irq_flags);
pc = preempt_count();
@ -489,16 +504,18 @@ static void uprobe_trace_func(struct trace_uprobe *tu, struct pt_regs *regs)
event = trace_current_buffer_lock_reserve(&buffer, call->event.type,
size, irq_flags, pc);
if (!event)
return;
return 0;
entry = ring_buffer_event_data(event);
entry->ip = uprobe_get_swbp_addr(task_pt_regs(current));
entry->ip = instruction_pointer(task_pt_regs(current));
data = (u8 *)&entry[1];
for (i = 0; i < tu->nr_args; i++)
call_fetch(&tu->args[i].fetch, regs, data + tu->args[i].offset);
if (!filter_current_check_discard(buffer, call, entry, event))
trace_buffer_unlock_commit(buffer, event, irq_flags, pc);
return 0;
}
/* Event entry printers */
@ -537,42 +554,43 @@ partial:
return TRACE_TYPE_PARTIAL_LINE;
}
static int probe_event_enable(struct trace_uprobe *tu, int flag)
static inline bool is_trace_uprobe_enabled(struct trace_uprobe *tu)
{
return tu->flags & (TP_FLAG_TRACE | TP_FLAG_PROFILE);
}
typedef bool (*filter_func_t)(struct uprobe_consumer *self,
enum uprobe_filter_ctx ctx,
struct mm_struct *mm);
static int
probe_event_enable(struct trace_uprobe *tu, int flag, filter_func_t filter)
{
struct uprobe_trace_consumer *utc;
int ret = 0;
if (!tu->inode || tu->consumer)
if (is_trace_uprobe_enabled(tu))
return -EINTR;
utc = kzalloc(sizeof(struct uprobe_trace_consumer), GFP_KERNEL);
if (!utc)
return -EINTR;
utc->cons.handler = uprobe_dispatcher;
utc->cons.filter = NULL;
ret = uprobe_register(tu->inode, tu->offset, &utc->cons);
if (ret) {
kfree(utc);
return ret;
}
WARN_ON(!uprobe_filter_is_empty(&tu->filter));
tu->flags |= flag;
utc->tu = tu;
tu->consumer = utc;
tu->consumer.filter = filter;
ret = uprobe_register(tu->inode, tu->offset, &tu->consumer);
if (ret)
tu->flags &= ~flag;
return 0;
return ret;
}
static void probe_event_disable(struct trace_uprobe *tu, int flag)
{
if (!tu->inode || !tu->consumer)
if (!is_trace_uprobe_enabled(tu))
return;
uprobe_unregister(tu->inode, tu->offset, &tu->consumer->cons);
WARN_ON(!uprobe_filter_is_empty(&tu->filter));
uprobe_unregister(tu->inode, tu->offset, &tu->consumer);
tu->flags &= ~flag;
kfree(tu->consumer);
tu->consumer = NULL;
}
static int uprobe_event_define_fields(struct ftrace_event_call *event_call)
@ -646,8 +664,96 @@ static int set_print_fmt(struct trace_uprobe *tu)
}
#ifdef CONFIG_PERF_EVENTS
static bool
__uprobe_perf_filter(struct trace_uprobe_filter *filter, struct mm_struct *mm)
{
struct perf_event *event;
if (filter->nr_systemwide)
return true;
list_for_each_entry(event, &filter->perf_events, hw.tp_list) {
if (event->hw.tp_target->mm == mm)
return true;
}
return false;
}
static inline bool
uprobe_filter_event(struct trace_uprobe *tu, struct perf_event *event)
{
return __uprobe_perf_filter(&tu->filter, event->hw.tp_target->mm);
}
static int uprobe_perf_open(struct trace_uprobe *tu, struct perf_event *event)
{
bool done;
write_lock(&tu->filter.rwlock);
if (event->hw.tp_target) {
/*
* event->parent != NULL means copy_process(), we can avoid
* uprobe_apply(). current->mm must be probed and we can rely
* on dup_mmap() which preserves the already installed bp's.
*
* attr.enable_on_exec means that exec/mmap will install the
* breakpoints we need.
*/
done = tu->filter.nr_systemwide ||
event->parent || event->attr.enable_on_exec ||
uprobe_filter_event(tu, event);
list_add(&event->hw.tp_list, &tu->filter.perf_events);
} else {
done = tu->filter.nr_systemwide;
tu->filter.nr_systemwide++;
}
write_unlock(&tu->filter.rwlock);
if (!done)
uprobe_apply(tu->inode, tu->offset, &tu->consumer, true);
return 0;
}
static int uprobe_perf_close(struct trace_uprobe *tu, struct perf_event *event)
{
bool done;
write_lock(&tu->filter.rwlock);
if (event->hw.tp_target) {
list_del(&event->hw.tp_list);
done = tu->filter.nr_systemwide ||
(event->hw.tp_target->flags & PF_EXITING) ||
uprobe_filter_event(tu, event);
} else {
tu->filter.nr_systemwide--;
done = tu->filter.nr_systemwide;
}
write_unlock(&tu->filter.rwlock);
if (!done)
uprobe_apply(tu->inode, tu->offset, &tu->consumer, false);
return 0;
}
static bool uprobe_perf_filter(struct uprobe_consumer *uc,
enum uprobe_filter_ctx ctx, struct mm_struct *mm)
{
struct trace_uprobe *tu;
int ret;
tu = container_of(uc, struct trace_uprobe, consumer);
read_lock(&tu->filter.rwlock);
ret = __uprobe_perf_filter(&tu->filter, mm);
read_unlock(&tu->filter.rwlock);
return ret;
}
/* uprobe profile handler */
static void uprobe_perf_func(struct trace_uprobe *tu, struct pt_regs *regs)
static int uprobe_perf_func(struct trace_uprobe *tu, struct pt_regs *regs)
{
struct ftrace_event_call *call = &tu->call;
struct uprobe_trace_entry_head *entry;
@ -656,11 +762,14 @@ static void uprobe_perf_func(struct trace_uprobe *tu, struct pt_regs *regs)
int size, __size, i;
int rctx;
if (!uprobe_perf_filter(&tu->consumer, 0, current->mm))
return UPROBE_HANDLER_REMOVE;
__size = sizeof(*entry) + tu->size;
size = ALIGN(__size + sizeof(u32), sizeof(u64));
size -= sizeof(u32);
if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE, "profile buffer not large enough"))
return;
return 0;
preempt_disable();
@ -668,7 +777,7 @@ static void uprobe_perf_func(struct trace_uprobe *tu, struct pt_regs *regs)
if (!entry)
goto out;
entry->ip = uprobe_get_swbp_addr(task_pt_regs(current));
entry->ip = instruction_pointer(task_pt_regs(current));
data = (u8 *)&entry[1];
for (i = 0; i < tu->nr_args; i++)
call_fetch(&tu->args[i].fetch, regs, data + tu->args[i].offset);
@ -678,6 +787,7 @@ static void uprobe_perf_func(struct trace_uprobe *tu, struct pt_regs *regs)
out:
preempt_enable();
return 0;
}
#endif /* CONFIG_PERF_EVENTS */
@ -688,7 +798,7 @@ int trace_uprobe_register(struct ftrace_event_call *event, enum trace_reg type,
switch (type) {
case TRACE_REG_REGISTER:
return probe_event_enable(tu, TP_FLAG_TRACE);
return probe_event_enable(tu, TP_FLAG_TRACE, NULL);
case TRACE_REG_UNREGISTER:
probe_event_disable(tu, TP_FLAG_TRACE);
@ -696,11 +806,18 @@ int trace_uprobe_register(struct ftrace_event_call *event, enum trace_reg type,
#ifdef CONFIG_PERF_EVENTS
case TRACE_REG_PERF_REGISTER:
return probe_event_enable(tu, TP_FLAG_PROFILE);
return probe_event_enable(tu, TP_FLAG_PROFILE, uprobe_perf_filter);
case TRACE_REG_PERF_UNREGISTER:
probe_event_disable(tu, TP_FLAG_PROFILE);
return 0;
case TRACE_REG_PERF_OPEN:
return uprobe_perf_open(tu, data);
case TRACE_REG_PERF_CLOSE:
return uprobe_perf_close(tu, data);
#endif
default:
return 0;
@ -710,22 +827,20 @@ int trace_uprobe_register(struct ftrace_event_call *event, enum trace_reg type,
static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs)
{
struct uprobe_trace_consumer *utc;
struct trace_uprobe *tu;
int ret = 0;
utc = container_of(con, struct uprobe_trace_consumer, cons);
tu = utc->tu;
if (!tu || tu->consumer != utc)
return 0;
tu = container_of(con, struct trace_uprobe, consumer);
tu->nhit++;
if (tu->flags & TP_FLAG_TRACE)
uprobe_trace_func(tu, regs);
ret |= uprobe_trace_func(tu, regs);
#ifdef CONFIG_PERF_EVENTS
if (tu->flags & TP_FLAG_PROFILE)
uprobe_perf_func(tu, regs);
ret |= uprobe_perf_func(tu, regs);
#endif
return 0;
return ret;
}
static struct trace_event_functions uprobe_funcs = {