2009-04-08 18:40:59 +00:00
|
|
|
#include <trace/syscall.h>
|
2009-08-24 21:43:14 +00:00
|
|
|
#include <trace/events/syscalls.h>
|
2009-03-07 04:52:59 +00:00
|
|
|
#include <linux/kernel.h>
|
2009-08-10 20:52:47 +00:00
|
|
|
#include <linux/ftrace.h>
|
perf: Do the big rename: Performance Counters -> Performance Events
Bye-bye Performance Counters, welcome Performance Events!
In the past few months the perfcounters subsystem has grown out its
initial role of counting hardware events, and has become (and is
becoming) a much broader generic event enumeration, reporting, logging,
monitoring, analysis facility.
Naming its core object 'perf_counter' and naming the subsystem
'perfcounters' has become more and more of a misnomer. With pending
code like hw-breakpoints support the 'counter' name is less and
less appropriate.
All in one, we've decided to rename the subsystem to 'performance
events' and to propagate this rename through all fields, variables
and API names. (in an ABI compatible fashion)
The word 'event' is also a bit shorter than 'counter' - which makes
it slightly more convenient to write/handle as well.
Thanks goes to Stephane Eranian who first observed this misnomer and
suggested a rename.
User-space tooling and ABI compatibility is not affected - this patch
should be function-invariant. (Also, defconfigs were not touched to
keep the size down.)
This patch has been generated via the following script:
FILES=$(find * -type f | grep -vE 'oprofile|[^K]config')
sed -i \
-e 's/PERF_EVENT_/PERF_RECORD_/g' \
-e 's/PERF_COUNTER/PERF_EVENT/g' \
-e 's/perf_counter/perf_event/g' \
-e 's/nb_counters/nb_events/g' \
-e 's/swcounter/swevent/g' \
-e 's/tpcounter_event/tp_event/g' \
$FILES
for N in $(find . -name perf_counter.[ch]); do
M=$(echo $N | sed 's/perf_counter/perf_event/g')
mv $N $M
done
FILES=$(find . -name perf_event.*)
sed -i \
-e 's/COUNTER_MASK/REG_MASK/g' \
-e 's/COUNTER/EVENT/g' \
-e 's/\<event\>/event_id/g' \
-e 's/counter/event/g' \
-e 's/Counter/Event/g' \
$FILES
... to keep it as correct as possible. This script can also be
used by anyone who has pending perfcounters patches - it converts
a Linux kernel tree over to the new naming. We tried to time this
change to the point in time where the amount of pending patches
is the smallest: the end of the merge window.
Namespace clashes were fixed up in a preparatory patch - and some
stylistic fallout will be fixed up in a subsequent patch.
( NOTE: 'counters' are still the proper terminology when we deal
with hardware registers - and these sed scripts are a bit
over-eager in renaming them. I've undone some of that, but
in case there's something left where 'counter' would be
better than 'event' we can undo that on an individual basis
instead of touching an otherwise nicely automated patch. )
Suggested-by: Stephane Eranian <eranian@google.com>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Acked-by: Paul Mackerras <paulus@samba.org>
Reviewed-by: Arjan van de Ven <arjan@linux.intel.com>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: David Howells <dhowells@redhat.com>
Cc: Kyle McMartin <kyle@mcmartin.ca>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: <linux-arch@vger.kernel.org>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2009-09-21 10:02:48 +00:00
|
|
|
#include <linux/perf_event.h>
|
2009-03-07 04:52:59 +00:00
|
|
|
#include <asm/syscall.h>
|
|
|
|
|
|
|
|
#include "trace_output.h"
|
|
|
|
#include "trace.h"
|
|
|
|
|
2009-03-15 21:10:37 +00:00
|
|
|
static DEFINE_MUTEX(syscall_trace_lock);
|
2009-08-10 20:52:47 +00:00
|
|
|
static int sys_refcount_enter;
|
|
|
|
static int sys_refcount_exit;
|
2009-08-24 21:40:22 +00:00
|
|
|
static DECLARE_BITMAP(enabled_enter_syscalls, NR_syscalls);
|
|
|
|
static DECLARE_BITMAP(enabled_exit_syscalls, NR_syscalls);
|
2009-03-07 04:52:59 +00:00
|
|
|
|
2009-09-19 04:50:42 +00:00
|
|
|
extern unsigned long __start_syscalls_metadata[];
|
|
|
|
extern unsigned long __stop_syscalls_metadata[];
|
|
|
|
|
|
|
|
static struct syscall_metadata **syscalls_metadata;
|
|
|
|
|
|
|
|
static struct syscall_metadata *find_syscall_meta(unsigned long syscall)
|
|
|
|
{
|
|
|
|
struct syscall_metadata *start;
|
|
|
|
struct syscall_metadata *stop;
|
|
|
|
char str[KSYM_SYMBOL_LEN];
|
|
|
|
|
|
|
|
|
|
|
|
start = (struct syscall_metadata *)__start_syscalls_metadata;
|
|
|
|
stop = (struct syscall_metadata *)__stop_syscalls_metadata;
|
|
|
|
kallsyms_lookup(syscall, NULL, NULL, NULL, str);
|
|
|
|
|
|
|
|
for ( ; start < stop; start++) {
|
|
|
|
/*
|
|
|
|
* Only compare after the "sys" prefix. Archs that use
|
|
|
|
* syscall wrappers may have syscalls symbols aliases prefixed
|
|
|
|
* with "SyS" instead of "sys", leading to an unwanted
|
|
|
|
* mismatch.
|
|
|
|
*/
|
|
|
|
if (start->name && !strcmp(start->name + 3, str + 3))
|
|
|
|
return start;
|
|
|
|
}
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
|
|
|
|
static struct syscall_metadata *syscall_nr_to_meta(int nr)
|
|
|
|
{
|
|
|
|
if (!syscalls_metadata || nr >= NR_syscalls || nr < 0)
|
|
|
|
return NULL;
|
|
|
|
|
|
|
|
return syscalls_metadata[nr];
|
|
|
|
}
|
|
|
|
|
2009-03-13 14:42:11 +00:00
|
|
|
enum print_line_t
|
|
|
|
print_syscall_enter(struct trace_iterator *iter, int flags)
|
|
|
|
{
|
|
|
|
struct trace_seq *s = &iter->seq;
|
|
|
|
struct trace_entry *ent = iter->ent;
|
|
|
|
struct syscall_trace_enter *trace;
|
|
|
|
struct syscall_metadata *entry;
|
|
|
|
int i, ret, syscall;
|
|
|
|
|
2009-08-10 20:52:53 +00:00
|
|
|
trace = (typeof(trace))ent;
|
2009-03-13 14:42:11 +00:00
|
|
|
syscall = trace->nr;
|
|
|
|
entry = syscall_nr_to_meta(syscall);
|
2009-08-10 20:52:53 +00:00
|
|
|
|
2009-03-13 14:42:11 +00:00
|
|
|
if (!entry)
|
|
|
|
goto end;
|
|
|
|
|
2009-12-01 08:23:36 +00:00
|
|
|
if (entry->enter_event->id != ent->type) {
|
2009-08-10 20:52:53 +00:00
|
|
|
WARN_ON_ONCE(1);
|
|
|
|
goto end;
|
|
|
|
}
|
|
|
|
|
2009-03-13 14:42:11 +00:00
|
|
|
ret = trace_seq_printf(s, "%s(", entry->name);
|
|
|
|
if (!ret)
|
|
|
|
return TRACE_TYPE_PARTIAL_LINE;
|
|
|
|
|
|
|
|
for (i = 0; i < entry->nb_args; i++) {
|
|
|
|
/* parameter types */
|
2009-08-17 08:55:18 +00:00
|
|
|
if (trace_flags & TRACE_ITER_VERBOSE) {
|
2009-03-13 14:42:11 +00:00
|
|
|
ret = trace_seq_printf(s, "%s ", entry->types[i]);
|
|
|
|
if (!ret)
|
|
|
|
return TRACE_TYPE_PARTIAL_LINE;
|
|
|
|
}
|
|
|
|
/* parameter values */
|
2009-08-20 08:13:35 +00:00
|
|
|
ret = trace_seq_printf(s, "%s: %lx%s", entry->args[i],
|
2009-03-13 14:42:11 +00:00
|
|
|
trace->args[i],
|
2009-08-20 08:13:35 +00:00
|
|
|
i == entry->nb_args - 1 ? "" : ", ");
|
2009-03-13 14:42:11 +00:00
|
|
|
if (!ret)
|
|
|
|
return TRACE_TYPE_PARTIAL_LINE;
|
|
|
|
}
|
|
|
|
|
2009-08-20 08:13:35 +00:00
|
|
|
ret = trace_seq_putc(s, ')');
|
|
|
|
if (!ret)
|
|
|
|
return TRACE_TYPE_PARTIAL_LINE;
|
|
|
|
|
2009-03-13 14:42:11 +00:00
|
|
|
end:
|
2009-08-20 08:13:35 +00:00
|
|
|
ret = trace_seq_putc(s, '\n');
|
|
|
|
if (!ret)
|
|
|
|
return TRACE_TYPE_PARTIAL_LINE;
|
|
|
|
|
2009-03-13 14:42:11 +00:00
|
|
|
return TRACE_TYPE_HANDLED;
|
|
|
|
}
|
|
|
|
|
|
|
|
enum print_line_t
|
|
|
|
print_syscall_exit(struct trace_iterator *iter, int flags)
|
|
|
|
{
|
|
|
|
struct trace_seq *s = &iter->seq;
|
|
|
|
struct trace_entry *ent = iter->ent;
|
|
|
|
struct syscall_trace_exit *trace;
|
|
|
|
int syscall;
|
|
|
|
struct syscall_metadata *entry;
|
|
|
|
int ret;
|
|
|
|
|
2009-08-10 20:52:53 +00:00
|
|
|
trace = (typeof(trace))ent;
|
2009-03-13 14:42:11 +00:00
|
|
|
syscall = trace->nr;
|
|
|
|
entry = syscall_nr_to_meta(syscall);
|
2009-08-10 20:52:53 +00:00
|
|
|
|
2009-03-13 14:42:11 +00:00
|
|
|
if (!entry) {
|
|
|
|
trace_seq_printf(s, "\n");
|
|
|
|
return TRACE_TYPE_HANDLED;
|
|
|
|
}
|
|
|
|
|
2009-12-01 08:23:36 +00:00
|
|
|
if (entry->exit_event->id != ent->type) {
|
2009-08-10 20:52:53 +00:00
|
|
|
WARN_ON_ONCE(1);
|
|
|
|
return TRACE_TYPE_UNHANDLED;
|
|
|
|
}
|
|
|
|
|
2009-03-13 14:42:11 +00:00
|
|
|
ret = trace_seq_printf(s, "%s -> 0x%lx\n", entry->name,
|
|
|
|
trace->ret);
|
|
|
|
if (!ret)
|
|
|
|
return TRACE_TYPE_PARTIAL_LINE;
|
|
|
|
|
|
|
|
return TRACE_TYPE_HANDLED;
|
|
|
|
}
|
|
|
|
|
2009-08-19 07:52:25 +00:00
|
|
|
extern char *__bad_type_size(void);
|
|
|
|
|
|
|
|
#define SYSCALL_FIELD(type, name) \
|
|
|
|
sizeof(type) != sizeof(trace.name) ? \
|
|
|
|
__bad_type_size() : \
|
2009-10-06 06:09:50 +00:00
|
|
|
#type, #name, offsetof(typeof(trace), name), \
|
|
|
|
sizeof(trace.name), is_signed_type(type)
|
2009-08-19 07:52:25 +00:00
|
|
|
|
2009-12-15 07:39:45 +00:00
|
|
|
static
|
|
|
|
int __set_enter_print_fmt(struct syscall_metadata *entry, char *buf, int len)
|
|
|
|
{
|
|
|
|
int i;
|
|
|
|
int pos = 0;
|
|
|
|
|
|
|
|
/* When len=0, we just calculate the needed length */
|
|
|
|
#define LEN_OR_ZERO (len ? len - pos : 0)
|
|
|
|
|
|
|
|
pos += snprintf(buf + pos, LEN_OR_ZERO, "\"");
|
|
|
|
for (i = 0; i < entry->nb_args; i++) {
|
|
|
|
pos += snprintf(buf + pos, LEN_OR_ZERO, "%s: 0x%%0%zulx%s",
|
|
|
|
entry->args[i], sizeof(unsigned long),
|
|
|
|
i == entry->nb_args - 1 ? "" : ", ");
|
|
|
|
}
|
|
|
|
pos += snprintf(buf + pos, LEN_OR_ZERO, "\"");
|
|
|
|
|
|
|
|
for (i = 0; i < entry->nb_args; i++) {
|
|
|
|
pos += snprintf(buf + pos, LEN_OR_ZERO,
|
|
|
|
", ((unsigned long)(REC->%s))", entry->args[i]);
|
|
|
|
}
|
|
|
|
|
|
|
|
#undef LEN_OR_ZERO
|
|
|
|
|
|
|
|
/* return the length of print_fmt */
|
|
|
|
return pos;
|
|
|
|
}
|
|
|
|
|
|
|
|
static int set_syscall_print_fmt(struct ftrace_event_call *call)
|
|
|
|
{
|
|
|
|
char *print_fmt;
|
|
|
|
int len;
|
|
|
|
struct syscall_metadata *entry = call->data;
|
|
|
|
|
|
|
|
if (entry->enter_event != call) {
|
|
|
|
call->print_fmt = "\"0x%lx\", REC->ret";
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* First: called with 0 length to calculate the needed length */
|
|
|
|
len = __set_enter_print_fmt(entry, NULL, 0);
|
|
|
|
|
|
|
|
print_fmt = kmalloc(len + 1, GFP_KERNEL);
|
|
|
|
if (!print_fmt)
|
|
|
|
return -ENOMEM;
|
|
|
|
|
|
|
|
/* Second: actually write the @print_fmt */
|
|
|
|
__set_enter_print_fmt(entry, print_fmt, len + 1);
|
|
|
|
call->print_fmt = print_fmt;
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
static void free_syscall_print_fmt(struct ftrace_event_call *call)
|
|
|
|
{
|
|
|
|
struct syscall_metadata *entry = call->data;
|
|
|
|
|
|
|
|
if (entry->enter_event == call)
|
|
|
|
kfree(call->print_fmt);
|
|
|
|
}
|
|
|
|
|
2009-08-19 07:54:51 +00:00
|
|
|
int syscall_enter_define_fields(struct ftrace_event_call *call)
|
|
|
|
{
|
|
|
|
struct syscall_trace_enter trace;
|
2009-12-01 08:23:30 +00:00
|
|
|
struct syscall_metadata *meta = call->data;
|
2009-08-19 07:54:51 +00:00
|
|
|
int ret;
|
|
|
|
int i;
|
|
|
|
int offset = offsetof(typeof(trace), args);
|
|
|
|
|
2009-11-26 07:49:33 +00:00
|
|
|
ret = trace_define_field(call, SYSCALL_FIELD(int, nr), FILTER_OTHER);
|
|
|
|
if (ret)
|
|
|
|
return ret;
|
|
|
|
|
2009-08-19 07:54:51 +00:00
|
|
|
for (i = 0; i < meta->nb_args; i++) {
|
2009-08-27 03:09:51 +00:00
|
|
|
ret = trace_define_field(call, meta->types[i],
|
|
|
|
meta->args[i], offset,
|
2009-08-07 02:33:22 +00:00
|
|
|
sizeof(unsigned long), 0,
|
|
|
|
FILTER_OTHER);
|
2009-08-19 07:54:51 +00:00
|
|
|
offset += sizeof(unsigned long);
|
|
|
|
}
|
|
|
|
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
int syscall_exit_define_fields(struct ftrace_event_call *call)
|
|
|
|
{
|
|
|
|
struct syscall_trace_exit trace;
|
|
|
|
int ret;
|
|
|
|
|
2009-11-26 07:49:33 +00:00
|
|
|
ret = trace_define_field(call, SYSCALL_FIELD(int, nr), FILTER_OTHER);
|
|
|
|
if (ret)
|
|
|
|
return ret;
|
|
|
|
|
2009-10-06 06:09:50 +00:00
|
|
|
ret = trace_define_field(call, SYSCALL_FIELD(long, ret),
|
2009-08-07 02:33:22 +00:00
|
|
|
FILTER_OTHER);
|
2009-08-19 07:54:51 +00:00
|
|
|
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2009-08-10 20:52:47 +00:00
|
|
|
void ftrace_syscall_enter(struct pt_regs *regs, long id)
|
2009-03-07 04:52:59 +00:00
|
|
|
{
|
2009-03-13 14:42:11 +00:00
|
|
|
struct syscall_trace_enter *entry;
|
|
|
|
struct syscall_metadata *sys_data;
|
|
|
|
struct ring_buffer_event *event;
|
2009-09-02 18:17:06 +00:00
|
|
|
struct ring_buffer *buffer;
|
2009-03-13 14:42:11 +00:00
|
|
|
int size;
|
2009-03-07 04:52:59 +00:00
|
|
|
int syscall_nr;
|
|
|
|
|
|
|
|
syscall_nr = syscall_get_nr(current, regs);
|
tracing: Check invalid syscall nr while tracing syscalls
Most arch syscall_get_nr() implementations returns -1 if the syscall
number is not valid. Accessing the bit field without a check might
result in a kernel oops (at least I saw it on s390 for ftrace selftest).
Before this change, this problem did not occur, because the invalid
syscall number (-1) caused syscall_nr_to_meta() to return NULL.
There are at least two scenarios where syscall_get_nr() can return -1:
1. For example, ptrace stores an invalid syscall number, and thus,
tracing code resets it.
(see do_syscall_trace_enter in arch/s390/kernel/ptrace.c)
2. The syscall_regfunc() (kernel/tracepoint.c) sets the
TIF_SYSCALL_FTRACE (now: TIF_SYSCALL_TRACEPOINT) flag for all threads
which include kernel threads.
However, the ftrace selftest triggers a kernel oops when testing
syscall trace points:
- The kernel thread is started as ususal (do_fork()),
- tracing code sets TIF_SYSCALL_FTRACE,
- the ret_from_fork() function is triggered and starts
ftrace_syscall_exit() with an invalid syscall number.
To avoid these scenarios, I suggest to check the syscall_nr.
For instance, the ftrace selftest fails for s390 (with config option
CONFIG_FTRACE_SYSCALLS set) and produces the following kernel oops.
Unable to handle kernel pointer dereference at virtual kernel address 2000000000
Oops: 0038 [#1] PREEMPT SMP
Modules linked in:
CPU: 0 Not tainted 2.6.31-rc6-next-20090819-dirty #18
Process kthreadd (pid: 818, task: 000000003ea207e8, ksp: 000000003e813eb8)
Krnl PSW : 0704100180000000 00000000000ea54c (ftrace_syscall_exit+0x58/0xdc)
R:0 T:1 IO:1 EX:1 Key:0 M:1 W:0 P:0 AS:0 CC:1 PM:0 EA:3
Krnl GPRS: 0000000000000000 00000000000e0000 ffffffffffffffff 20000000008c2650
0000000000000007 0000000000000000 0000000000000000 0000000000000000
0000000000000000 0000000000000000 ffffffffffffffff 000000003e813d78
000000003e813f58 0000000000505ba8 000000003e813e18 000000003e813d78
Krnl Code: 00000000000ea540: e330d0000008 ag %r3,0(%r13)
00000000000ea546: a7480007 lhi %r4,7
00000000000ea54a: 1442 nr %r4,%r2
>00000000000ea54c: e31030000090 llgc %r1,0(%r3)
00000000000ea552: 5410d008 n %r1,8(%r13)
00000000000ea556: 8a104000 sra %r1,0(%r4)
00000000000ea55a: 5410d00c n %r1,12(%r13)
00000000000ea55e: 1211 ltr %r1,%r1
Call Trace:
([<0000000000000000>] 0x0)
[<000000000001fa22>] do_syscall_trace_exit+0x132/0x18c
[<000000000002d0c4>] sysc_return+0x0/0x8
[<000000000001c738>] kernel_thread_starter+0x0/0xc
Last Breaking-Event-Address:
[<00000000000ea51e>] ftrace_syscall_exit+0x2a/0xdc
Signed-off-by: Hendrik Brueckner <brueckner@linux.vnet.ibm.com>
Acked-by: Heiko Carstens <heiko.carstens@de.ibm.com>
Cc: Jason Baron <jbaron@redhat.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Lai Jiangshan <laijs@cn.fujitsu.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca>
Cc: Jiaying Zhang <jiayingz@google.com>
Cc: Martin Bligh <mbligh@google.com>
Cc: Li Zefan <lizf@cn.fujitsu.com>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Paul Mundt <lethal@linux-sh.org>
LKML-Reference: <20090825125027.GE4639@cetus.boeblingen.de.ibm.com>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
2009-08-25 12:50:27 +00:00
|
|
|
if (syscall_nr < 0)
|
|
|
|
return;
|
2009-08-10 20:52:47 +00:00
|
|
|
if (!test_bit(syscall_nr, enabled_enter_syscalls))
|
|
|
|
return;
|
2009-03-07 04:52:59 +00:00
|
|
|
|
2009-03-13 14:42:11 +00:00
|
|
|
sys_data = syscall_nr_to_meta(syscall_nr);
|
|
|
|
if (!sys_data)
|
|
|
|
return;
|
|
|
|
|
|
|
|
size = sizeof(*entry) + sizeof(unsigned long) * sys_data->nb_args;
|
|
|
|
|
2009-12-01 08:23:36 +00:00
|
|
|
event = trace_current_buffer_lock_reserve(&buffer,
|
|
|
|
sys_data->enter_event->id, size, 0, 0);
|
2009-03-13 14:42:11 +00:00
|
|
|
if (!event)
|
|
|
|
return;
|
|
|
|
|
|
|
|
entry = ring_buffer_event_data(event);
|
|
|
|
entry->nr = syscall_nr;
|
|
|
|
syscall_get_arguments(current, regs, 0, sys_data->nb_args, entry->args);
|
|
|
|
|
2009-09-02 18:17:06 +00:00
|
|
|
if (!filter_current_check_discard(buffer, sys_data->enter_event,
|
|
|
|
entry, event))
|
|
|
|
trace_current_buffer_unlock_commit(buffer, event, 0, 0);
|
2009-03-07 04:52:59 +00:00
|
|
|
}
|
|
|
|
|
2009-08-10 20:52:47 +00:00
|
|
|
void ftrace_syscall_exit(struct pt_regs *regs, long ret)
|
2009-03-07 04:52:59 +00:00
|
|
|
{
|
2009-03-13 14:42:11 +00:00
|
|
|
struct syscall_trace_exit *entry;
|
|
|
|
struct syscall_metadata *sys_data;
|
|
|
|
struct ring_buffer_event *event;
|
2009-09-02 18:17:06 +00:00
|
|
|
struct ring_buffer *buffer;
|
2009-03-07 04:52:59 +00:00
|
|
|
int syscall_nr;
|
|
|
|
|
|
|
|
syscall_nr = syscall_get_nr(current, regs);
|
tracing: Check invalid syscall nr while tracing syscalls
Most arch syscall_get_nr() implementations returns -1 if the syscall
number is not valid. Accessing the bit field without a check might
result in a kernel oops (at least I saw it on s390 for ftrace selftest).
Before this change, this problem did not occur, because the invalid
syscall number (-1) caused syscall_nr_to_meta() to return NULL.
There are at least two scenarios where syscall_get_nr() can return -1:
1. For example, ptrace stores an invalid syscall number, and thus,
tracing code resets it.
(see do_syscall_trace_enter in arch/s390/kernel/ptrace.c)
2. The syscall_regfunc() (kernel/tracepoint.c) sets the
TIF_SYSCALL_FTRACE (now: TIF_SYSCALL_TRACEPOINT) flag for all threads
which include kernel threads.
However, the ftrace selftest triggers a kernel oops when testing
syscall trace points:
- The kernel thread is started as ususal (do_fork()),
- tracing code sets TIF_SYSCALL_FTRACE,
- the ret_from_fork() function is triggered and starts
ftrace_syscall_exit() with an invalid syscall number.
To avoid these scenarios, I suggest to check the syscall_nr.
For instance, the ftrace selftest fails for s390 (with config option
CONFIG_FTRACE_SYSCALLS set) and produces the following kernel oops.
Unable to handle kernel pointer dereference at virtual kernel address 2000000000
Oops: 0038 [#1] PREEMPT SMP
Modules linked in:
CPU: 0 Not tainted 2.6.31-rc6-next-20090819-dirty #18
Process kthreadd (pid: 818, task: 000000003ea207e8, ksp: 000000003e813eb8)
Krnl PSW : 0704100180000000 00000000000ea54c (ftrace_syscall_exit+0x58/0xdc)
R:0 T:1 IO:1 EX:1 Key:0 M:1 W:0 P:0 AS:0 CC:1 PM:0 EA:3
Krnl GPRS: 0000000000000000 00000000000e0000 ffffffffffffffff 20000000008c2650
0000000000000007 0000000000000000 0000000000000000 0000000000000000
0000000000000000 0000000000000000 ffffffffffffffff 000000003e813d78
000000003e813f58 0000000000505ba8 000000003e813e18 000000003e813d78
Krnl Code: 00000000000ea540: e330d0000008 ag %r3,0(%r13)
00000000000ea546: a7480007 lhi %r4,7
00000000000ea54a: 1442 nr %r4,%r2
>00000000000ea54c: e31030000090 llgc %r1,0(%r3)
00000000000ea552: 5410d008 n %r1,8(%r13)
00000000000ea556: 8a104000 sra %r1,0(%r4)
00000000000ea55a: 5410d00c n %r1,12(%r13)
00000000000ea55e: 1211 ltr %r1,%r1
Call Trace:
([<0000000000000000>] 0x0)
[<000000000001fa22>] do_syscall_trace_exit+0x132/0x18c
[<000000000002d0c4>] sysc_return+0x0/0x8
[<000000000001c738>] kernel_thread_starter+0x0/0xc
Last Breaking-Event-Address:
[<00000000000ea51e>] ftrace_syscall_exit+0x2a/0xdc
Signed-off-by: Hendrik Brueckner <brueckner@linux.vnet.ibm.com>
Acked-by: Heiko Carstens <heiko.carstens@de.ibm.com>
Cc: Jason Baron <jbaron@redhat.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Lai Jiangshan <laijs@cn.fujitsu.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca>
Cc: Jiaying Zhang <jiayingz@google.com>
Cc: Martin Bligh <mbligh@google.com>
Cc: Li Zefan <lizf@cn.fujitsu.com>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Paul Mundt <lethal@linux-sh.org>
LKML-Reference: <20090825125027.GE4639@cetus.boeblingen.de.ibm.com>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
2009-08-25 12:50:27 +00:00
|
|
|
if (syscall_nr < 0)
|
|
|
|
return;
|
2009-08-10 20:52:47 +00:00
|
|
|
if (!test_bit(syscall_nr, enabled_exit_syscalls))
|
|
|
|
return;
|
2009-03-07 04:52:59 +00:00
|
|
|
|
2009-03-13 14:42:11 +00:00
|
|
|
sys_data = syscall_nr_to_meta(syscall_nr);
|
|
|
|
if (!sys_data)
|
|
|
|
return;
|
|
|
|
|
2009-12-01 08:23:36 +00:00
|
|
|
event = trace_current_buffer_lock_reserve(&buffer,
|
|
|
|
sys_data->exit_event->id, sizeof(*entry), 0, 0);
|
2009-03-13 14:42:11 +00:00
|
|
|
if (!event)
|
|
|
|
return;
|
|
|
|
|
|
|
|
entry = ring_buffer_event_data(event);
|
|
|
|
entry->nr = syscall_nr;
|
|
|
|
entry->ret = syscall_get_return_value(current, regs);
|
|
|
|
|
2009-09-02 18:17:06 +00:00
|
|
|
if (!filter_current_check_discard(buffer, sys_data->exit_event,
|
|
|
|
entry, event))
|
|
|
|
trace_current_buffer_unlock_commit(buffer, event, 0, 0);
|
2009-03-07 04:52:59 +00:00
|
|
|
}
|
|
|
|
|
2009-08-13 20:34:53 +00:00
|
|
|
int reg_event_syscall_enter(struct ftrace_event_call *call)
|
2009-03-07 04:52:59 +00:00
|
|
|
{
|
2009-08-10 20:52:47 +00:00
|
|
|
int ret = 0;
|
|
|
|
int num;
|
|
|
|
|
2009-12-01 08:23:47 +00:00
|
|
|
num = ((struct syscall_metadata *)call->data)->syscall_nr;
|
2009-08-24 21:40:22 +00:00
|
|
|
if (num < 0 || num >= NR_syscalls)
|
2009-08-10 20:52:47 +00:00
|
|
|
return -ENOSYS;
|
|
|
|
mutex_lock(&syscall_trace_lock);
|
|
|
|
if (!sys_refcount_enter)
|
2009-08-24 21:43:14 +00:00
|
|
|
ret = register_trace_sys_enter(ftrace_syscall_enter);
|
2009-12-08 03:14:52 +00:00
|
|
|
if (!ret) {
|
2009-08-10 20:52:47 +00:00
|
|
|
set_bit(num, enabled_enter_syscalls);
|
|
|
|
sys_refcount_enter++;
|
|
|
|
}
|
|
|
|
mutex_unlock(&syscall_trace_lock);
|
|
|
|
return ret;
|
2009-03-07 04:52:59 +00:00
|
|
|
}
|
|
|
|
|
2009-08-13 20:34:53 +00:00
|
|
|
void unreg_event_syscall_enter(struct ftrace_event_call *call)
|
2009-03-07 04:52:59 +00:00
|
|
|
{
|
2009-08-10 20:52:47 +00:00
|
|
|
int num;
|
2009-03-07 04:52:59 +00:00
|
|
|
|
2009-12-01 08:23:47 +00:00
|
|
|
num = ((struct syscall_metadata *)call->data)->syscall_nr;
|
2009-08-24 21:40:22 +00:00
|
|
|
if (num < 0 || num >= NR_syscalls)
|
2009-08-10 20:52:47 +00:00
|
|
|
return;
|
|
|
|
mutex_lock(&syscall_trace_lock);
|
|
|
|
sys_refcount_enter--;
|
|
|
|
clear_bit(num, enabled_enter_syscalls);
|
|
|
|
if (!sys_refcount_enter)
|
2009-08-24 21:43:14 +00:00
|
|
|
unregister_trace_sys_enter(ftrace_syscall_enter);
|
2009-08-10 20:52:47 +00:00
|
|
|
mutex_unlock(&syscall_trace_lock);
|
|
|
|
}
|
2009-03-07 04:52:59 +00:00
|
|
|
|
2009-08-13 20:34:53 +00:00
|
|
|
int reg_event_syscall_exit(struct ftrace_event_call *call)
|
2009-03-07 04:52:59 +00:00
|
|
|
{
|
2009-08-10 20:52:47 +00:00
|
|
|
int ret = 0;
|
|
|
|
int num;
|
|
|
|
|
2009-12-01 08:23:47 +00:00
|
|
|
num = ((struct syscall_metadata *)call->data)->syscall_nr;
|
2009-08-24 21:40:22 +00:00
|
|
|
if (num < 0 || num >= NR_syscalls)
|
2009-08-10 20:52:47 +00:00
|
|
|
return -ENOSYS;
|
|
|
|
mutex_lock(&syscall_trace_lock);
|
|
|
|
if (!sys_refcount_exit)
|
2009-08-24 21:43:14 +00:00
|
|
|
ret = register_trace_sys_exit(ftrace_syscall_exit);
|
2009-12-08 03:14:52 +00:00
|
|
|
if (!ret) {
|
2009-08-10 20:52:47 +00:00
|
|
|
set_bit(num, enabled_exit_syscalls);
|
|
|
|
sys_refcount_exit++;
|
2009-03-07 04:52:59 +00:00
|
|
|
}
|
2009-08-10 20:52:47 +00:00
|
|
|
mutex_unlock(&syscall_trace_lock);
|
|
|
|
return ret;
|
|
|
|
}
|
2009-03-07 04:52:59 +00:00
|
|
|
|
2009-08-13 20:34:53 +00:00
|
|
|
void unreg_event_syscall_exit(struct ftrace_event_call *call)
|
2009-08-10 20:52:47 +00:00
|
|
|
{
|
|
|
|
int num;
|
2009-03-07 04:52:59 +00:00
|
|
|
|
2009-12-01 08:23:47 +00:00
|
|
|
num = ((struct syscall_metadata *)call->data)->syscall_nr;
|
2009-08-24 21:40:22 +00:00
|
|
|
if (num < 0 || num >= NR_syscalls)
|
2009-08-10 20:52:47 +00:00
|
|
|
return;
|
|
|
|
mutex_lock(&syscall_trace_lock);
|
|
|
|
sys_refcount_exit--;
|
|
|
|
clear_bit(num, enabled_exit_syscalls);
|
|
|
|
if (!sys_refcount_exit)
|
2009-08-24 21:43:14 +00:00
|
|
|
unregister_trace_sys_exit(ftrace_syscall_exit);
|
2009-08-10 20:52:47 +00:00
|
|
|
mutex_unlock(&syscall_trace_lock);
|
2009-03-07 04:52:59 +00:00
|
|
|
}
|
2009-08-10 20:52:47 +00:00
|
|
|
|
2009-12-01 08:23:55 +00:00
|
|
|
int init_syscall_trace(struct ftrace_event_call *call)
|
|
|
|
{
|
|
|
|
int id;
|
|
|
|
|
2009-12-15 07:39:45 +00:00
|
|
|
if (set_syscall_print_fmt(call) < 0)
|
|
|
|
return -ENOMEM;
|
|
|
|
|
2009-12-29 02:13:59 +00:00
|
|
|
id = trace_event_raw_init(call);
|
|
|
|
|
|
|
|
if (id < 0) {
|
2009-12-15 07:39:45 +00:00
|
|
|
free_syscall_print_fmt(call);
|
2009-12-29 02:13:59 +00:00
|
|
|
return id;
|
2009-12-15 07:39:45 +00:00
|
|
|
}
|
2009-12-29 02:13:59 +00:00
|
|
|
|
|
|
|
return id;
|
2009-12-01 08:23:55 +00:00
|
|
|
}
|
|
|
|
|
2010-01-26 09:40:03 +00:00
|
|
|
unsigned long __init arch_syscall_addr(int nr)
|
|
|
|
{
|
|
|
|
return (unsigned long)sys_call_table[nr];
|
|
|
|
}
|
|
|
|
|
2009-09-19 04:50:42 +00:00
|
|
|
int __init init_ftrace_syscalls(void)
|
|
|
|
{
|
|
|
|
struct syscall_metadata *meta;
|
|
|
|
unsigned long addr;
|
|
|
|
int i;
|
|
|
|
|
|
|
|
syscalls_metadata = kzalloc(sizeof(*syscalls_metadata) *
|
|
|
|
NR_syscalls, GFP_KERNEL);
|
|
|
|
if (!syscalls_metadata) {
|
|
|
|
WARN_ON(1);
|
|
|
|
return -ENOMEM;
|
|
|
|
}
|
|
|
|
|
|
|
|
for (i = 0; i < NR_syscalls; i++) {
|
|
|
|
addr = arch_syscall_addr(i);
|
|
|
|
meta = find_syscall_meta(addr);
|
2009-12-01 08:23:47 +00:00
|
|
|
if (!meta)
|
|
|
|
continue;
|
|
|
|
|
|
|
|
meta->syscall_nr = i;
|
2009-09-19 04:50:42 +00:00
|
|
|
syscalls_metadata[i] = meta;
|
|
|
|
}
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
core_initcall(init_ftrace_syscalls);
|
|
|
|
|
2009-12-21 06:27:35 +00:00
|
|
|
#ifdef CONFIG_PERF_EVENTS
|
2009-08-11 18:22:53 +00:00
|
|
|
|
2010-03-05 04:35:37 +00:00
|
|
|
static DECLARE_BITMAP(enabled_perf_enter_syscalls, NR_syscalls);
|
|
|
|
static DECLARE_BITMAP(enabled_perf_exit_syscalls, NR_syscalls);
|
|
|
|
static int sys_perf_refcount_enter;
|
|
|
|
static int sys_perf_refcount_exit;
|
2009-08-10 20:53:02 +00:00
|
|
|
|
2010-03-05 04:35:37 +00:00
|
|
|
static void perf_syscall_enter(struct pt_regs *regs, long id)
|
2009-08-10 20:53:02 +00:00
|
|
|
{
|
|
|
|
struct syscall_metadata *sys_data;
|
2009-09-18 04:10:28 +00:00
|
|
|
struct syscall_trace_enter *rec;
|
|
|
|
unsigned long flags;
|
2009-08-10 20:53:02 +00:00
|
|
|
int syscall_nr;
|
2009-11-23 10:37:29 +00:00
|
|
|
int rctx;
|
2009-08-11 18:22:53 +00:00
|
|
|
int size;
|
2009-08-10 20:53:02 +00:00
|
|
|
|
|
|
|
syscall_nr = syscall_get_nr(current, regs);
|
2010-03-05 04:35:37 +00:00
|
|
|
if (!test_bit(syscall_nr, enabled_perf_enter_syscalls))
|
2009-08-10 20:53:02 +00:00
|
|
|
return;
|
|
|
|
|
|
|
|
sys_data = syscall_nr_to_meta(syscall_nr);
|
|
|
|
if (!sys_data)
|
|
|
|
return;
|
|
|
|
|
2009-08-11 18:22:53 +00:00
|
|
|
/* get the size after alignment with the u32 buffer size field */
|
|
|
|
size = sizeof(unsigned long) * sys_data->nb_args + sizeof(*rec);
|
|
|
|
size = ALIGN(size + sizeof(u32), sizeof(u64));
|
|
|
|
size -= sizeof(u32);
|
|
|
|
|
2010-03-05 04:35:37 +00:00
|
|
|
if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE,
|
|
|
|
"perf buffer not large enough"))
|
2009-09-18 04:10:28 +00:00
|
|
|
return;
|
|
|
|
|
2010-03-05 04:35:37 +00:00
|
|
|
rec = (struct syscall_trace_enter *)perf_trace_buf_prepare(size,
|
2010-01-28 01:32:29 +00:00
|
|
|
sys_data->enter_event->id, &rctx, &flags);
|
|
|
|
if (!rec)
|
|
|
|
return;
|
2009-09-18 04:10:28 +00:00
|
|
|
|
|
|
|
rec->nr = syscall_nr;
|
|
|
|
syscall_get_arguments(current, regs, 0, sys_data->nb_args,
|
|
|
|
(unsigned long *)&rec->args);
|
2010-03-05 04:35:37 +00:00
|
|
|
perf_trace_buf_submit(rec, size, rctx, 0, 1, flags, regs);
|
2009-08-10 20:53:02 +00:00
|
|
|
}
|
|
|
|
|
2010-03-05 04:35:37 +00:00
|
|
|
int perf_sysenter_enable(struct ftrace_event_call *call)
|
2009-08-10 20:53:02 +00:00
|
|
|
{
|
|
|
|
int ret = 0;
|
|
|
|
int num;
|
|
|
|
|
2009-12-01 08:24:01 +00:00
|
|
|
num = ((struct syscall_metadata *)call->data)->syscall_nr;
|
2009-08-10 20:53:02 +00:00
|
|
|
|
|
|
|
mutex_lock(&syscall_trace_lock);
|
2010-03-05 04:35:37 +00:00
|
|
|
if (!sys_perf_refcount_enter)
|
|
|
|
ret = register_trace_sys_enter(perf_syscall_enter);
|
2009-08-10 20:53:02 +00:00
|
|
|
if (ret) {
|
|
|
|
pr_info("event trace: Could not activate"
|
|
|
|
"syscall entry trace point");
|
|
|
|
} else {
|
2010-03-05 04:35:37 +00:00
|
|
|
set_bit(num, enabled_perf_enter_syscalls);
|
|
|
|
sys_perf_refcount_enter++;
|
2009-08-10 20:53:02 +00:00
|
|
|
}
|
|
|
|
mutex_unlock(&syscall_trace_lock);
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2010-03-05 04:35:37 +00:00
|
|
|
void perf_sysenter_disable(struct ftrace_event_call *call)
|
2009-08-10 20:53:02 +00:00
|
|
|
{
|
|
|
|
int num;
|
|
|
|
|
2009-12-01 08:24:01 +00:00
|
|
|
num = ((struct syscall_metadata *)call->data)->syscall_nr;
|
2009-08-10 20:53:02 +00:00
|
|
|
|
|
|
|
mutex_lock(&syscall_trace_lock);
|
2010-03-05 04:35:37 +00:00
|
|
|
sys_perf_refcount_enter--;
|
|
|
|
clear_bit(num, enabled_perf_enter_syscalls);
|
|
|
|
if (!sys_perf_refcount_enter)
|
|
|
|
unregister_trace_sys_enter(perf_syscall_enter);
|
2009-08-10 20:53:02 +00:00
|
|
|
mutex_unlock(&syscall_trace_lock);
|
|
|
|
}
|
|
|
|
|
2010-03-05 04:35:37 +00:00
|
|
|
static void perf_syscall_exit(struct pt_regs *regs, long ret)
|
2009-08-10 20:53:02 +00:00
|
|
|
{
|
|
|
|
struct syscall_metadata *sys_data;
|
2009-09-18 04:10:28 +00:00
|
|
|
struct syscall_trace_exit *rec;
|
|
|
|
unsigned long flags;
|
2009-08-10 20:53:02 +00:00
|
|
|
int syscall_nr;
|
2009-11-23 10:37:29 +00:00
|
|
|
int rctx;
|
2009-09-18 04:10:28 +00:00
|
|
|
int size;
|
2009-08-10 20:53:02 +00:00
|
|
|
|
|
|
|
syscall_nr = syscall_get_nr(current, regs);
|
2010-03-05 04:35:37 +00:00
|
|
|
if (!test_bit(syscall_nr, enabled_perf_exit_syscalls))
|
2009-08-10 20:53:02 +00:00
|
|
|
return;
|
|
|
|
|
|
|
|
sys_data = syscall_nr_to_meta(syscall_nr);
|
|
|
|
if (!sys_data)
|
|
|
|
return;
|
|
|
|
|
2009-09-18 04:10:28 +00:00
|
|
|
/* We can probably do that at build time */
|
|
|
|
size = ALIGN(sizeof(*rec) + sizeof(u32), sizeof(u64));
|
|
|
|
size -= sizeof(u32);
|
2009-08-11 18:22:53 +00:00
|
|
|
|
2009-09-18 04:10:28 +00:00
|
|
|
/*
|
|
|
|
* Impossible, but be paranoid with the future
|
|
|
|
* How to put this check outside runtime?
|
|
|
|
*/
|
2010-03-05 04:35:37 +00:00
|
|
|
if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE,
|
|
|
|
"exit event has grown above perf buffer size"))
|
2009-09-18 04:10:28 +00:00
|
|
|
return;
|
|
|
|
|
2010-03-05 04:35:37 +00:00
|
|
|
rec = (struct syscall_trace_exit *)perf_trace_buf_prepare(size,
|
2010-01-28 01:32:29 +00:00
|
|
|
sys_data->exit_event->id, &rctx, &flags);
|
|
|
|
if (!rec)
|
|
|
|
return;
|
2009-09-18 04:10:28 +00:00
|
|
|
|
|
|
|
rec->nr = syscall_nr;
|
|
|
|
rec->ret = syscall_get_return_value(current, regs);
|
|
|
|
|
2010-03-05 04:35:37 +00:00
|
|
|
perf_trace_buf_submit(rec, size, rctx, 0, 1, flags, regs);
|
2009-08-10 20:53:02 +00:00
|
|
|
}
|
|
|
|
|
2010-03-05 04:35:37 +00:00
|
|
|
int perf_sysexit_enable(struct ftrace_event_call *call)
|
2009-08-10 20:53:02 +00:00
|
|
|
{
|
|
|
|
int ret = 0;
|
|
|
|
int num;
|
|
|
|
|
2009-12-01 08:24:01 +00:00
|
|
|
num = ((struct syscall_metadata *)call->data)->syscall_nr;
|
2009-08-10 20:53:02 +00:00
|
|
|
|
|
|
|
mutex_lock(&syscall_trace_lock);
|
2010-03-05 04:35:37 +00:00
|
|
|
if (!sys_perf_refcount_exit)
|
|
|
|
ret = register_trace_sys_exit(perf_syscall_exit);
|
2009-08-10 20:53:02 +00:00
|
|
|
if (ret) {
|
|
|
|
pr_info("event trace: Could not activate"
|
2010-02-24 07:40:22 +00:00
|
|
|
"syscall exit trace point");
|
2009-08-10 20:53:02 +00:00
|
|
|
} else {
|
2010-03-05 04:35:37 +00:00
|
|
|
set_bit(num, enabled_perf_exit_syscalls);
|
|
|
|
sys_perf_refcount_exit++;
|
2009-08-10 20:53:02 +00:00
|
|
|
}
|
|
|
|
mutex_unlock(&syscall_trace_lock);
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2010-03-05 04:35:37 +00:00
|
|
|
void perf_sysexit_disable(struct ftrace_event_call *call)
|
2009-08-10 20:53:02 +00:00
|
|
|
{
|
|
|
|
int num;
|
|
|
|
|
2009-12-01 08:24:01 +00:00
|
|
|
num = ((struct syscall_metadata *)call->data)->syscall_nr;
|
2009-08-10 20:53:02 +00:00
|
|
|
|
|
|
|
mutex_lock(&syscall_trace_lock);
|
2010-03-05 04:35:37 +00:00
|
|
|
sys_perf_refcount_exit--;
|
|
|
|
clear_bit(num, enabled_perf_exit_syscalls);
|
|
|
|
if (!sys_perf_refcount_exit)
|
|
|
|
unregister_trace_sys_exit(perf_syscall_exit);
|
2009-08-10 20:53:02 +00:00
|
|
|
mutex_unlock(&syscall_trace_lock);
|
|
|
|
}
|
|
|
|
|
2009-12-21 06:27:35 +00:00
|
|
|
#endif /* CONFIG_PERF_EVENTS */
|
2009-08-10 20:53:02 +00:00
|
|
|
|